From 52cf4f010d5155bcfe71d492764515cd0835baa7 Mon Sep 17 00:00:00 2001 From: Richard Beare Date: Wed, 15 Apr 2026 17:16:21 +1000 Subject: [PATCH 1/3] added extra points where ocr_skipped is set --- ocr_service/processor/converter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocr_service/processor/converter.py b/ocr_service/processor/converter.py index 35b596d..e967a84 100755 --- a/ocr_service/processor/converter.py +++ b/ocr_service/processor/converter.py @@ -378,6 +378,7 @@ def prepare(self, ctx: ProcessContext) -> None: if settings.OPERATION_MODE == "NO_OCR": ctx.output_text = self._xml_to_text(ctx) ctx.metadata["pages"] = 1 + ctx.metadata["ocr_skipped"] = True else: self.log.info("Detected XML content; converting to PDF...") ctx.pdf_stream = self._preprocess_xml_to_pdf( @@ -400,6 +401,7 @@ def prepare(self, ctx: ProcessContext) -> None: self.log.info("Detected HTML content, handling via fallback, NO_OCR mode") ctx.output_text = self._extract_text_fallback(ctx.stream, is_html=True) ctx.metadata["pages"] = 1 + ctx.metadata["ocr_skipped"] = True else: self.log.info("Detected HTML content; converting to PDF via unoserver/LO") ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name) @@ -409,6 +411,7 @@ def prepare(self, ctx: ProcessContext) -> None: ctx.output_text = self._extract_text_fallback(ctx.stream, is_rtf=True) ctx.metadata["pages"] = 1 ctx.metadata["content-type"] = "text/plain" + ctx.metadata["ocr_skipped"] = True else: ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name) @@ -422,6 +425,7 @@ def prepare(self, ctx: ProcessContext) -> None: ctx.output_text = ctx.stream.decode("utf-8", "ignore") ctx.metadata["pages"] = 1 ctx.metadata["content-type"] = "text/plain" + ctx.metadata["ocr_skipped"] = True else: self.log.info("Unknown file type; attempting to convert to PDF via unoserver/LO") From 1423489916ee790eb0440bef2c41621440a9a81a Mon Sep 17 00:00:00 2001 From: Richard Beare Date: Thu, 16 Apr 2026 07:53:12 
+1000 Subject: [PATCH 2/3] added text_length field to metadata --- ocr_service/api/process.py | 4 +++- ocr_service/processor/processor.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ocr_service/api/process.py b/ocr_service/api/process.py index 813b31a..d22bf11 100755 --- a/ocr_service/api/process.py +++ b/ocr_service/api/process.py @@ -113,7 +113,9 @@ def process(request: Request, file: UploadFile | None = File(default=None)) -> O log.debug(f"Stream size: {len(stream)} bytes") ocr_skipped = bool(doc_metadata.get("ocr_skipped")) - code = 200 if len(output_text) > 0 or not stream or ocr_skipped else 500 + text_length = doc_metadata.get("text_length") + + code = 200 if text_length > 0 or not stream or ocr_skipped else 500 response: dict[Any, Any] = { "result": build_response( diff --git a/ocr_service/processor/processor.py b/ocr_service/processor/processor.py index d187f38..477b614 100755 --- a/ocr_service/processor/processor.py +++ b/ocr_service/processor/processor.py @@ -87,7 +87,7 @@ def process_stream(self, stream: bytes, file_name: str = "") -> tuple[str, dict] end_time = time.time() elapsed_time = float(round(float(end_time - start_time), 4)) doc_metadata["elapsed_time"] = elapsed_time - + doc_metadata["text_length"] = len(output_text) self.log.info("Finished processing file: " + file_name + " | Elapsed time: " + str(elapsed_time) + " seconds") except Exception: From d51ffaf0285a06152dcb75c39f223e5d2dfdf4cf Mon Sep 17 00:00:00 2001 From: Richard Beare Date: Fri, 17 Apr 2026 14:05:28 +1000 Subject: [PATCH 3/3] more robust decode of plain text --- ocr_service/utils/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocr_service/utils/utils.py b/ocr_service/utils/utils.py index 7efd2ed..897356f 100755 --- a/ocr_service/utils/utils.py +++ b/ocr_service/utils/utils.py @@ -195,7 +195,8 @@ def is_file_content_plain_text(stream: bytes, threshold: float = 0.95) -> bool: # If it can't be decoded as UTF-8 at all, treat as 
binary try: - sample.decode("utf-8") + # Old documents contain stray \x95 and \x96 bytes that are not valid UTF-8; decode with errors="replace" so they never raise. + sample.decode("utf-8", errors="replace") except UnicodeDecodeError: return False