From 8ce1bb7c6cc7440368eaa8e74fe3de28d5a0bf64 Mon Sep 17 00:00:00 2001 From: Me3sP Date: Wed, 20 May 2026 19:17:58 +0200 Subject: [PATCH] Route TOC-without-page-numbers documents to the correct strategy tree_parser only had two branches: a TOC with page numbers, or everything else. A document with a printed TOC that lists no page numbers fell into the else branch and was processed with process_no_toc, regenerating the structure from scratch and ignoring the existing TOC entirely. process_toc_no_page_numbers was therefore unreachable as a primary strategy and only ran as a fallback from process_toc_with_page_numbers. Add the missing branch so a TOC with no page numbers is dispatched to process_toc_no_page_numbers directly. Also forward start_index from meta_processor into process_toc_no_page_numbers, which previously relied on the default and would index incorrectly for non-top-level nodes. --- pageindex/page_index.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..41f813ed4 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -963,7 +963,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N if mode == 'process_toc_with_page_numbers': toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger) elif mode == 'process_toc_no_page_numbers': - toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger) + toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list,start_index=start_index, model=opt.model, logger=logger) else: toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger) @@ -1039,6 +1039,15 @@ async def tree_parser(page_list, opt, doc=None, logger=None): toc_page_list=check_toc_result['toc_page_list'], opt=opt, logger=logger) + elif check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "no": + toc_with_page_number = await meta_processor( + page_list, + mode='process_toc_no_page_numbers', + start_index=1, + toc_content=check_toc_result['toc_content'], + toc_page_list=check_toc_result['toc_page_list'], + opt=opt, + logger=logger) else: toc_with_page_number = await meta_processor( page_list,