Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ocr_service/api/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,9 @@ def process(request: Request, file: UploadFile | None = File(default=None)) -> O
log.debug(f"Stream size: {len(stream)} bytes")

ocr_skipped = bool(doc_metadata.get("ocr_skipped"))
code = 200 if len(output_text) > 0 or not stream or ocr_skipped else 500
text_length = doc_metadata.get("text_length")

code = 200 if text_length > 0 or not stream or ocr_skipped else 500

response: dict[Any, Any] = {
"result": build_response(
Expand Down
4 changes: 4 additions & 0 deletions ocr_service/processor/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ def prepare(self, ctx: ProcessContext) -> None:
if settings.OPERATION_MODE == "NO_OCR":
ctx.output_text = self._xml_to_text(ctx)
ctx.metadata["pages"] = 1
ctx.metadata["ocr_skipped"] = True
else:
self.log.info("Detected XML content; converting to PDF...")
ctx.pdf_stream = self._preprocess_xml_to_pdf(
Expand All @@ -400,6 +401,7 @@ def prepare(self, ctx: ProcessContext) -> None:
self.log.info("Detected HTML content, handling via fallback, NO_OCR mode")
ctx.output_text = self._extract_text_fallback(ctx.stream, is_html=True)
ctx.metadata["pages"] = 1
ctx.metadata["ocr_skipped"] = True
else:
self.log.info("Detected HTML content; converting to PDF via unoserver/LO")
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
Expand All @@ -409,6 +411,7 @@ def prepare(self, ctx: ProcessContext) -> None:
ctx.output_text = self._extract_text_fallback(ctx.stream, is_rtf=True)
ctx.metadata["pages"] = 1
ctx.metadata["content-type"] = "text/plain"
ctx.metadata["ocr_skipped"] = True
else:
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)

Expand All @@ -422,6 +425,7 @@ def prepare(self, ctx: ProcessContext) -> None:
ctx.output_text = ctx.stream.decode("utf-8", "ignore")
ctx.metadata["pages"] = 1
ctx.metadata["content-type"] = "text/plain"
ctx.metadata["ocr_skipped"] = True

else:
self.log.info("Unknown file type; attempting to convert to PDF via unoserver/LO")
Expand Down
2 changes: 1 addition & 1 deletion ocr_service/processor/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def process_stream(self, stream: bytes, file_name: str = "") -> tuple[str, dict]
end_time = time.time()
elapsed_time = float(round(float(end_time - start_time), 4))
doc_metadata["elapsed_time"] = elapsed_time

doc_metadata["text_length"] = len(output_text)
self.log.info("Finished processing file: " + file_name + " | Elapsed time: " + str(elapsed_time)
+ " seconds")
except Exception:
Expand Down
3 changes: 2 additions & 1 deletion ocr_service/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,8 @@ def is_file_content_plain_text(stream: bytes, threshold: float = 0.95) -> bool:

# If it can't be decoded as UTF-8 at all, treat as binary
try:
sample.decode("utf-8")
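# Old documents may contain stray \x95 and \x96 bytes (presumably Windows-1252
# bullet / en dash), which are not valid UTF-8. Decode with errors="replace" so
# these bytes do not cause the sample to be rejected as binary.
# NOTE(review): with errors="replace" decode() never raises UnicodeDecodeError,
# so the except branch below is no longer reachable — confirm this is intended.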
sample.decode("utf-8", errors="replace")
except UnicodeDecodeError:
return False

Expand Down