From d413cdf81401d2ff31cb9fedd51b17813b7065f8 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Sat, 14 Feb 2026 13:41:01 -0400 Subject: [PATCH] Support Tesseract "-c" options Full Tesseract "-c" options support: Use the new "options" fields for MuPDF's Tesseract invocation. --- src/__init__.py | 10 +++++++--- src/utils.py | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index e4eb5ca54..f37942221 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -13700,13 +13700,15 @@ def n2(self): return self.n return mupdf.fz_pixmap_components(self.this) - def pdfocr_save(self, filename, compress=1, language=None, tessdata=None): + def pdfocr_save(self, filename, compress=1, language=None, tessdata=None, options=""): ''' Save pixmap as an OCR-ed PDF page. ''' tessdata = get_tessdata(tessdata) opts = mupdf.FzPdfocrOptions() opts.compress = compress + if options: + opts.options = options if language: opts.language_set2( language) if tessdata: @@ -13721,7 +13723,7 @@ def pdfocr_save(self, filename, compress=1, language=None, tessdata=None): finally: out.fz_close_output() # Avoid MuPDF warning. - def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None): + def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None, options=""): """Save pixmap as an OCR-ed PDF page. Args: @@ -13731,6 +13733,8 @@ def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None): tessdata: (str) folder name of Tesseract's language support. If None we use environment variable TESSDATA_PREFIX or search for Tesseract installation. + options: (str) comma-separated list of any Tesseract options that can be + given using Tesseract's "-c" CLI parameter. 
Notes: On failure, make sure Tesseract is installed and you have set or environment variable "TESSDATA_PREFIX" to the folder @@ -13739,7 +13743,7 @@ def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None): tessdata = get_tessdata(tessdata) from io import BytesIO bio = BytesIO() - self.pdfocr_save(bio, compress=compress, language=language, tessdata=tessdata) + self.pdfocr_save(bio, compress=compress, language=language, tessdata=tessdata, options=options) return bio.getvalue() def pil_image(self): diff --git a/src/utils.py b/src/utils.py index 8e295989a..d44b3ade0 100644 --- a/src/utils.py +++ b/src/utils.py @@ -321,6 +321,7 @@ def get_textpage_ocr( dpi: int = 72, full: bool = False, tessdata: str = None, + options="", ) -> pymupdf.TextPage: """Create a Textpage from combined results of normal and OCR text parsing. @@ -329,6 +330,8 @@ def get_textpage_ocr( language: (str) specify expected language(s). Default is "eng" (English). dpi: (int) resolution in dpi, default 72. full: (bool) whether to OCR the full page image, or only its images (default) + options: (str) comma-separated list of any Tesseract options that can be + given using Tesseract's "-c" CLI parameter. """ pymupdf.CheckParent(page) tessdata = pymupdf.get_tessdata(tessdata) @@ -343,6 +346,7 @@ def full_ocr(page, dpi, language, flags): compress=False, language=language, tessdata=tessdata, + options=options, ), ) ocr_page = ocr_pdf.load_page(0) @@ -376,7 +380,10 @@ def full_ocr(page, dpi, language, flags): pix = pymupdf.Pixmap(pix, 0) imgdoc = pymupdf.Document( "pdf", - pix.pdfocr_tobytes(language=language, tessdata=tessdata), + pix.pdfocr_tobytes( + language=language, + tessdata=tessdata, + options=options), ) # pdf with OCRed page imgpage = imgdoc.load_page(0) # read image as a page pix = None