Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13700,13 +13700,15 @@ def n2(self):
return self.n
return mupdf.fz_pixmap_components(self.this)

def pdfocr_save(self, filename, compress=1, language=None, tessdata=None):
def pdfocr_save(self, filename, compress=1, language=None, tessdata=None, options=""):
'''
Save pixmap as an OCR-ed PDF page.
'''
tessdata = get_tessdata(tessdata)
opts = mupdf.FzPdfocrOptions()
opts.compress = compress
if options:
opts.options = options
if language:
opts.language_set2( language)
if tessdata:
Expand All @@ -13721,7 +13723,7 @@ def pdfocr_save(self, filename, compress=1, language=None, tessdata=None):
finally:
out.fz_close_output() # Avoid MuPDF warning.

def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None):
def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None, options=""):
"""Save pixmap as an OCR-ed PDF page.

Args:
Expand All @@ -13731,6 +13733,8 @@ def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None):
tessdata: (str) folder name of Tesseract's language support. If None
we use environment variable TESSDATA_PREFIX or search for
Tesseract installation.
options: (str) any Tesseract comma-separated options that can be
given using Tesseract's "-c" CLI parameter.
Notes:
On failure, make sure Tesseract is installed and you have set
<tessdata> or environment variable "TESSDATA_PREFIX" to the folder
Expand All @@ -13739,7 +13743,7 @@ def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None):
tessdata = get_tessdata(tessdata)
from io import BytesIO
bio = BytesIO()
self.pdfocr_save(bio, compress=compress, language=language, tessdata=tessdata)
self.pdfocr_save(bio, compress=compress, language=language, tessdata=tessdata, options=options)
return bio.getvalue()

def pil_image(self):
Expand Down
9 changes: 8 additions & 1 deletion src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ def get_textpage_ocr(
dpi: int = 72,
full: bool = False,
tessdata: str = None,
options="",
) -> pymupdf.TextPage:
"""Create a Textpage from combined results of normal and OCR text parsing.

Expand All @@ -329,6 +330,8 @@ def get_textpage_ocr(
language: (str) specify expected language(s). Default is "eng" (English).
dpi: (int) resolution in dpi, default 72.
full: (bool) whether to OCR the full page image, or only its images (default)
options: (str) any Tesseract comma-separated options that can be given
using Tesseract's "-c" CLI parameter.
"""
pymupdf.CheckParent(page)
tessdata = pymupdf.get_tessdata(tessdata)
Expand All @@ -343,6 +346,7 @@ def full_ocr(page, dpi, language, flags):
compress=False,
language=language,
tessdata=tessdata,
options=options,
),
)
ocr_page = ocr_pdf.load_page(0)
Expand Down Expand Up @@ -376,7 +380,10 @@ def full_ocr(page, dpi, language, flags):
pix = pymupdf.Pixmap(pix, 0)
imgdoc = pymupdf.Document(
"pdf",
pix.pdfocr_tobytes(language=language, tessdata=tessdata),
pix.pdfocr_tobytes(
language=language,
tessdata=tessdata,
options=options),
) # pdf with OCRed page
imgpage = imgdoc.load_page(0) # read image as a page
pix = None
Expand Down