diff --git a/colrev/packages/files_dir/src/files_dir.py b/colrev/packages/files_dir/src/files_dir.py index 2be696e3d..7f1da9a69 100644 --- a/colrev/packages/files_dir/src/files_dir.py +++ b/colrev/packages/files_dir/src/files_dir.py @@ -547,6 +547,10 @@ def _index_pdf( self.review_manager.logger.info(f" extract metadata from {file_path}") try: if not self.review_manager.settings.is_curated_masterdata_repo(): + # TODO : skip cover page + # extract _get_coverpages in colrev/packages/remove_coverpage/src/remove_cover_page.py + # use get_colrev_pdf_id() with custom_page = 1 (for coverpages) + # retrieve_based_on_colrev_pdf_id colrev_pdf_id = colrev.record.record.Record.get_colrev_pdf_id( pdf_path=file_path_abs diff --git a/colrev/record/record.py b/colrev/record/record.py index f31eab750..fbb805798 100644 --- a/colrev/record/record.py +++ b/colrev/record/record.py @@ -616,10 +616,13 @@ def get_colrev_id( def get_colrev_pdf_id( cls, pdf_path: Path, + custom_page: int = 0, ) -> str: # pragma: no cover """Generate the colrev_pdf_id""" - return colrev.record.record_identifier.get_colrev_pdf_id(pdf_path) + return colrev.record.record_identifier.get_colrev_pdf_id( + pdf_path, custom_page=custom_page + ) def get_toc_key(self) -> str: """Get the record's toc-key""" diff --git a/colrev/record/record_identifier.py b/colrev/record/record_identifier.py index bec49b00a..f0ba0e278 100644 --- a/colrev/record/record_identifier.py +++ b/colrev/record/record_identifier.py @@ -192,11 +192,12 @@ def get_colrev_id(record: colrev.record.record.Record, *, assume_complete: bool) return srep -def _get_colrev_pdf_id_cpid2(pdf_path: Path) -> str: +def _get_colrev_pdf_id_cpid2(pdf_path: Path, custom_page: int) -> str: with tempfile.NamedTemporaryFile(suffix=".png") as temp_file: file_name = temp_file.name try: doc: pymupdf.Document = pymupdf.open(pdf_path) + # TODO : select custom_page page = next(iter(doc)) # get the first page pix = page.get_pixmap(dpi=200) pix.save(file_name) # store image as 
a PNG @@ -214,7 +215,9 @@ def _get_colrev_pdf_id_cpid2(pdf_path: Path) -> str: raise colrev_exceptions.PDFHashError(path=pdf_path) from exc -def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2") -> str: +def get_colrev_pdf_id( + pdf_path: Path, *, cpid_version: str = "cpid2", custom_page: int = 0 +) -> str: """Get the PDF hash""" pdf_path = pdf_path.resolve() @@ -223,7 +226,7 @@ def get_colrev_pdf_id(pdf_path: Path, *, cpid_version: str = "cpid2") -> str: raise colrev_exceptions.InvalidPDFException(path=pdf_path) if cpid_version == "cpid2": - return _get_colrev_pdf_id_cpid2(pdf_path) + return _get_colrev_pdf_id_cpid2(pdf_path, custom_page=custom_page) raise NotImplementedError