From d413cdf81401d2ff31cb9fedd51b17813b7065f8 Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Sat, 14 Feb 2026 13:41:01 -0400
Subject: [PATCH] Support Tesseract "-c" options

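Full Tesseract "-c" options support:

Add an "options" parameter to Pixmap.pdfocr_save(), Pixmap.pdfocr_tobytes()
and get_textpage_ocr(), and pass it on to the new "options" field of the
MuPDF OCR options (FzPdfocrOptions) used for MuPDF's Tesseract invocation.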
---
 src/__init__.py | 10 +++++++---
 src/utils.py    |  9 ++++++++-
 2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/src/__init__.py b/src/__init__.py
index e4eb5ca54..f37942221 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -13700,13 +13700,15 @@ def n2(self):
             return self.n
         return mupdf.fz_pixmap_components(self.this)
 
-    def pdfocr_save(self, filename, compress=1, language=None, tessdata=None):
+    def pdfocr_save(self, filename, compress=1, language=None, tessdata=None, options=""):
         '''
         Save pixmap as an OCR-ed PDF page.
         '''
         tessdata = get_tessdata(tessdata)
         opts = mupdf.FzPdfocrOptions()
         opts.compress = compress
+        if options:
+            opts.options = options
         if language:
             opts.language_set2( language)
         if tessdata:
@@ -13721,7 +13723,7 @@ def pdfocr_save(self, filename, compress=1, language=None, tessdata=None):
             finally:
                 out.fz_close_output()   # Avoid MuPDF warning.
 
-    def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None):
+    def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None, options=""):
         """Save pixmap as an OCR-ed PDF page.
 
         Args:
@@ -13731,6 +13733,8 @@ def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None):
             tessdata: (str) folder name of Tesseract's language support. If None
                     we use environment variable TESSDATA_PREFIX or search for
                     Tesseract installation.
+            options: (str) comma-separated Tesseract options, as accepted
+                     by Tesseract's "-c" CLI parameter.
         Notes:
             On failure, make sure Tesseract is installed and you have set
             <tessdata> or environment variable "TESSDATA_PREFIX" to the folder
@@ -13739,7 +13743,7 @@ def pdfocr_tobytes(self, compress=True, language="eng", tessdata=None):
         tessdata = get_tessdata(tessdata)
         from io import BytesIO
         bio = BytesIO()
-        self.pdfocr_save(bio, compress=compress, language=language, tessdata=tessdata)
+        self.pdfocr_save(bio, compress=compress, language=language, tessdata=tessdata, options=options)
         return bio.getvalue()
 
     def pil_image(self):
diff --git a/src/utils.py b/src/utils.py
index 8e295989a..d44b3ade0 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -321,6 +321,7 @@ def get_textpage_ocr(
     dpi: int = 72,
     full: bool = False,
     tessdata: str = None,
+    options="",
 ) -> pymupdf.TextPage:
     """Create a Textpage from combined results of normal and OCR text parsing.
 
@@ -329,6 +330,8 @@ def get_textpage_ocr(
         language: (str) specify expected language(s). Default is "eng" (English).
         dpi: (int) resolution in dpi, default 72.
         full: (bool) whether to OCR the full page image, or only its images (default)
+        options: (str) comma-separated Tesseract options, as accepted by
+                 Tesseract's "-c" CLI parameter.
     """
     pymupdf.CheckParent(page)
     tessdata = pymupdf.get_tessdata(tessdata)
@@ -343,6 +346,7 @@ def full_ocr(page, dpi, language, flags):
                     compress=False,
                     language=language,
                     tessdata=tessdata,
+                    options=options,
                     ),
                 )
         ocr_page = ocr_pdf.load_page(0)
@@ -376,7 +380,10 @@ def full_ocr(page, dpi, language, flags):
                 pix = pymupdf.Pixmap(pix, 0)
             imgdoc = pymupdf.Document(
                     "pdf",
-                    pix.pdfocr_tobytes(language=language, tessdata=tessdata),
+                    pix.pdfocr_tobytes(
+                        language=language,
+                        tessdata=tessdata,
+                        options=options),
                     )  # pdf with OCRed page
             imgpage = imgdoc.load_page(0)  # read image as a page
             pix = None