From 8e1dcb0acb1ec6e618870c25828427146fab3873 Mon Sep 17 00:00:00 2001 From: ennajari Date: Sat, 28 Mar 2026 12:51:06 +0100 Subject: [PATCH] feat: improve CLI with output format, verbose mode, and API key validation - Add --output-format flag (json/markdown) for human-readable output - Add --output-dir flag for custom output directory - Add --verbose flag for detailed LLM request logging - Add automatic API key validation with provider-specific error messages - Add tqdm progress bar (optional, graceful fallback) - Add full docstrings and type hints to all helper functions - Update README with new CLI options documentation --- README.md | 36 +++- requirements.txt | 1 + run_pageindex.py | 464 +++++++++++++++++++++++++++++++++++------------ 3 files changed, 379 insertions(+), 122 deletions(-) diff --git a/README.md b/README.md index a32de7739..64b1b6ebb 100644 --- a/README.md +++ b/README.md @@ -169,13 +169,35 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf You can customize the processing with additional optional arguments: ``` ---model LLM model to use (default: gpt-4o-2024-11-20) ---toc-check-pages Pages to check for table of contents (default: 20) ---max-pages-per-node Max pages per node (default: 10) ---max-tokens-per-node Max tokens per node (default: 20000) ---if-add-node-id Add node ID (yes/no, default: yes) ---if-add-node-summary Add node summary (yes/no, default: yes) ---if-add-doc-description Add doc description (yes/no, default: yes) +--model LLM model to use (default: gpt-4o-2024-11-20) + Supports any LiteLLM provider prefix, e.g.: + anthropic/claude-sonnet-4-6 + gemini/gemini-2.0-flash +--toc-check-pages Pages to check for table of contents (default: 20) +--max-pages-per-node Max pages per node (default: 10) +--max-tokens-per-node Max tokens per node (default: 20000) +--if-add-node-id Add node ID (yes/no, default: yes) +--if-add-node-summary Add node summary (yes/no, default: yes) +--if-add-doc-description Add doc description (yes/no, 
default: yes) +--output-format Output format: "json" (default) or "markdown" + "markdown" writes a human-readable outline with + headings and summaries instead of raw JSON. +--output-dir Directory to write the output file (default: ./results) +--verbose Enable verbose logging (LiteLLM requests, retries, + token counts) +``` + +**Examples:** + +```bash +# Use Anthropic Claude instead of OpenAI +python3 run_pageindex.py --pdf_path report.pdf --model anthropic/claude-sonnet-4-6 + +# Export a human-readable Markdown outline instead of JSON +python3 run_pageindex.py --pdf_path report.pdf --output-format markdown + +# Save results to a custom directory with verbose logging +python3 run_pageindex.py --pdf_path report.pdf --output-dir ./my_results --verbose ``` diff --git a/requirements.txt b/requirements.txt index 613e92161..7c0170adb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ pymupdf==1.26.4 PyPDF2==3.0.1 python-dotenv==1.1.0 pyyaml==6.0.2 +tqdm>=4.66.0 # optional: progress bars in run_pageindex.py (--output-format markdown) diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..f448ccab5 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -1,134 +1,368 @@ +""" +run_pageindex.py — CLI for PageIndex document tree generation.
+ +Usage examples: + python run_pageindex.py --pdf_path report.pdf + python run_pageindex.py --md_path notes.md --output-format markdown + python run_pageindex.py --pdf_path paper.pdf --model anthropic/claude-sonnet-4-6 --verbose + python run_pageindex.py --pdf_path paper.pdf --output-dir ./my_results +""" import argparse -import os +import asyncio import json -from pageindex import * +import logging +import os +import sys +from typing import Optional + +from pageindex import page_index_main from pageindex.page_index_md import md_to_tree from pageindex.utils import ConfigLoader +try: + from tqdm import tqdm as _tqdm + HAS_TQDM = True +except ImportError: + HAS_TQDM = False + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _check_api_key(model: Optional[str]) -> None: + """Verify that the required API key is present for the chosen model. + + Detects the provider from the LiteLLM model string prefix and checks + the corresponding environment variable. Exits with a helpful message + if the key is missing. + + Args: + model: LiteLLM model string, e.g. ``"gpt-4o-2024-11-20"``, + ``"anthropic/claude-sonnet-4-6"``, ``"gemini/gemini-2.0-flash"``. + ``None`` falls back to the OpenAI check. 
+ """ + m = (model or "").lower() + + if m.startswith("anthropic/") or m.startswith("claude"): + if not os.getenv("ANTHROPIC_API_KEY"): + sys.exit( + "Error: ANTHROPIC_API_KEY is not set.\n" + " Set it with: export ANTHROPIC_API_KEY=sk-ant-...\n" + " Or add it to a .env file in the project root.\n" + " Get a key: https://console.anthropic.com/settings/keys" + ) + elif m.startswith("gemini/"): + if not (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")): + sys.exit( + "Error: GEMINI_API_KEY (or GOOGLE_API_KEY) is not set.\n" + " Set it with: export GEMINI_API_KEY=...\n" + " Or add it to a .env file in the project root.\n" + " Get a key: https://ai.google.dev/gemini-api/docs/api-key" + ) + else: + # Default provider: OpenAI (also accepts CHATGPT_API_KEY alias) + if not (os.getenv("OPENAI_API_KEY") or os.getenv("CHATGPT_API_KEY")): + sys.exit( + "Error: OPENAI_API_KEY is not set.\n" + " Set it with: export OPENAI_API_KEY=sk-...\n" + " Or add it to a .env file in the project root.\n" + " Get a key: https://platform.openai.com/api-keys\n" + "\n" + " Using a different provider? Pass --model with a prefix:\n" + " --model anthropic/claude-sonnet-4-6\n" + " --model gemini/gemini-2.0-flash" + ) + + +def _render_node_md(node: dict, depth: int, lines: list) -> None: + """Recursively render one tree node as Markdown headings. + + Args: + node: Tree node dict (may contain title, summary, start_index, + end_index, line_num, nodes). + depth: Current heading depth — 0 produces ``##``, 1 produces ``###``, + capped at ``######``. + lines: Accumulator list that receives rendered Markdown lines. 
+ """ + heading = "#" * min(depth + 2, 6) + title = node.get("title", "Untitled") + + # Location hint: page range for PDF, line number for Markdown + start = node.get("start_index") or node.get("line_num") or "" + end = node.get("end_index", "") + if start and end: + loc = f" *(pages {start}–{end})*" + elif start: + loc = f" *(line {start})*" + else: + loc = "" + + lines.append(f"{heading} {title}{loc}") + if node.get("summary"): + lines.append(f"\n{node['summary']}") + lines.append("") + + for child in node.get("nodes") or []: + _render_node_md(child, depth + 1, lines) + + +def structure_to_markdown(result: dict) -> str: + """Convert a PageIndex tree structure dict to a Markdown document outline. + + Produces a readable outline with headings, page/line location hints, and + LLM-generated summaries (when present in the structure). + + Args: + result: The dict returned by :func:`page_index_main` or + :func:`md_to_tree`. Expected top-level keys: + ``doc_name``, ``doc_description`` (optional), + ``structure`` (list of tree nodes). + + Returns: + A Markdown string representing the full document tree. + """ + lines: list = [] + + doc_name = result.get("doc_name", "Document") + lines.append(f"# {doc_name}") + lines.append("") + + if result.get("doc_description"): + lines.append(f"> {result['doc_description']}") + lines.append("") + + structure = result.get("structure", []) + top_nodes = ( + _tqdm(structure, desc="Rendering nodes", unit="node") + if HAS_TQDM + else structure + ) + for node in top_nodes: + _render_node_md(node, depth=0, lines=lines) + + return "\n".join(lines) + + +def _save_result(result: dict, base_name: str, output_dir: str, fmt: str) -> str: + """Persist the indexing result to disk in the requested format. + + Args: + result: The tree structure dict. + base_name: Stem of the source file (used for the output filename). + output_dir: Directory to write the output file (created if absent). + fmt: ``"json"`` (default) or ``"markdown"``. 
+ + Returns: + Absolute path to the written output file. + """ + os.makedirs(output_dir, exist_ok=True) + + if fmt == "markdown": + output_file = os.path.join(output_dir, f"{base_name}_structure.md") + content = structure_to_markdown(result) + with open(output_file, "w", encoding="utf-8") as f: + f.write(content) + else: + output_file = os.path.join(output_dir, f"{base_name}_structure.json") + with open(output_file, "w", encoding="utf-8") as f: + json.dump(result, f, indent=2, ensure_ascii=False) + + return os.path.abspath(output_file) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=( + "Process a PDF or Markdown document and generate a PageIndex tree structure." + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""\ +examples: + python run_pageindex.py --pdf_path report.pdf + python run_pageindex.py --md_path notes.md --output-format markdown + python run_pageindex.py --pdf_path paper.pdf --model anthropic/claude-sonnet-4-6 --verbose + python run_pageindex.py --pdf_path paper.pdf --output-dir ./my_results +""", + ) + + # ---- Input ---- + parser.add_argument("--pdf_path", type=str, help="Path to the PDF file") + parser.add_argument("--md_path", type=str, help="Path to the Markdown file") + + # ---- Model ---- + parser.add_argument( + "--model", type=str, default=None, + help=( + "LiteLLM model string (overrides config.yaml). 
" + "Examples: gpt-4o-2024-11-20, anthropic/claude-sonnet-4-6, " + "gemini/gemini-2.0-flash" + ), + ) + + # ---- PDF-specific ---- + parser.add_argument( + "--toc-check-pages", type=int, default=None, + help="Number of pages to scan for a table of contents (PDF only, default: 20)", + ) + parser.add_argument( + "--max-pages-per-node", type=int, default=None, + help="Maximum pages per tree node (PDF only, default: 10)", + ) + parser.add_argument( + "--max-tokens-per-node", type=int, default=None, + help="Maximum tokens per tree node (PDF only, default: 20000)", + ) + + # ---- Feature flags ---- + parser.add_argument( + "--if-add-node-id", type=str, default=None, + help='Add numeric IDs to each node ("yes"/"no")', + ) + parser.add_argument( + "--if-add-node-summary", type=str, default=None, + help='Generate LLM summaries for each node ("yes"/"no")', + ) + parser.add_argument( + "--if-add-doc-description", type=str, default=None, + help='Generate a one-sentence document description ("yes"/"no")', + ) + parser.add_argument( + "--if-add-node-text", type=str, default=None, + help='Embed raw page text in each node ("yes"/"no")', + ) + + # ---- Markdown-specific ---- + parser.add_argument( + "--if-thinning", type=str, default="no", + help='Apply tree thinning for sparse Markdown docs ("yes"/"no", Markdown only)', + ) + parser.add_argument( + "--thinning-threshold", type=int, default=5000, + help="Minimum token count threshold for thinning (Markdown only, default: 5000)", + ) + parser.add_argument( + "--summary-token-threshold", type=int, default=200, + help=( + "Node token count above which a summary is generated " + "(Markdown only, default: 200)" + ), + ) + + # ---- Output ---- + parser.add_argument( + "--output-format", + type=str, default="json", choices=["json", "markdown"], + help=( + 'Output format (default: "json"). ' + '"json" writes a structured tree; ' + '"markdown" writes a human-readable outline with headings and summaries.' 
+ ), + ) + parser.add_argument( + "--output-dir", type=str, default="./results", + help="Directory to write the output file (default: ./results)", + ) + + # ---- Logging ---- + parser.add_argument( + "--verbose", action="store_true", + help=( + "Enable verbose logging: shows LiteLLM requests, retries, " + "and token counts during processing." + ), + ) + + return parser + + if __name__ == "__main__": - # Set up argument parser - parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure') - parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') - parser.add_argument('--md_path', type=str, help='Path to the Markdown file') - - parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)') - - parser.add_argument('--toc-check-pages', type=int, default=None, - help='Number of pages to check for table of contents (PDF only)') - parser.add_argument('--max-pages-per-node', type=int, default=None, - help='Maximum number of pages per node (PDF only)') - parser.add_argument('--max-tokens-per-node', type=int, default=None, - help='Maximum number of tokens per node (PDF only)') - - parser.add_argument('--if-add-node-id', type=str, default=None, - help='Whether to add node id to the node') - parser.add_argument('--if-add-node-summary', type=str, default=None, - help='Whether to add summary to the node') - parser.add_argument('--if-add-doc-description', type=str, default=None, - help='Whether to add doc description to the doc') - parser.add_argument('--if-add-node-text', type=str, default=None, - help='Whether to add text to the node') - - # Markdown specific arguments - parser.add_argument('--if-thinning', type=str, default='no', - help='Whether to apply tree thinning for markdown (markdown only)') - parser.add_argument('--thinning-threshold', type=int, default=5000, - help='Minimum token threshold for thinning (markdown only)') - parser.add_argument('--summary-token-threshold', 
type=int, default=200, - help='Token threshold for generating summaries (markdown only)') + parser = _build_parser() args = parser.parse_args() - - # Validate that exactly one file type is specified + + # --- Configure logging --- + if args.verbose: + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s") + import litellm as _litellm + _litellm.set_verbose = True + else: + logging.basicConfig(level=logging.WARNING) + + # --- Validate input --- if not args.pdf_path and not args.md_path: - raise ValueError("Either --pdf_path or --md_path must be specified") + parser.error("Either --pdf_path or --md_path must be specified.") if args.pdf_path and args.md_path: - raise ValueError("Only one of --pdf_path or --md_path can be specified") - + parser.error("Only one of --pdf_path or --md_path can be specified.") + + # --- Validate API key before doing any work --- + _check_api_key(args.model) + + # ----------------------------------------------------------------------- + # PDF processing + # ----------------------------------------------------------------------- if args.pdf_path: - # Validate PDF file - if not args.pdf_path.lower().endswith('.pdf'): - raise ValueError("PDF file must have .pdf extension") + if not args.pdf_path.lower().endswith(".pdf"): + parser.error("--pdf_path must point to a .pdf file.") if not os.path.isfile(args.pdf_path): - raise ValueError(f"PDF file not found: {args.pdf_path}") - - # Process PDF file + parser.error(f"File not found: {args.pdf_path}") + user_opt = { - 'model': args.model, - 'toc_check_page_num': args.toc_check_pages, - 'max_page_num_each_node': args.max_pages_per_node, - 'max_token_num_each_node': args.max_tokens_per_node, - 'if_add_node_id': args.if_add_node_id, - 'if_add_node_summary': args.if_add_node_summary, - 'if_add_doc_description': args.if_add_doc_description, - 'if_add_node_text': args.if_add_node_text, + "model": args.model, + "toc_check_page_num": args.toc_check_pages, + "max_page_num_each_node": 
args.max_pages_per_node, + "max_token_num_each_node": args.max_tokens_per_node, + "if_add_node_id": args.if_add_node_id, + "if_add_node_summary": args.if_add_node_summary, + "if_add_doc_description": args.if_add_doc_description, + "if_add_node_text": args.if_add_node_text, } opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) - # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt) - print('Parsing done, saving to file...') - - # Save results - pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0] - output_dir = './results' - output_file = f'{output_dir}/{pdf_name}_structure.json' - os.makedirs(output_dir, exist_ok=True) - - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(toc_with_page_number, f, indent=2) - - print(f'Tree structure saved to: {output_file}') - + print(f"Processing PDF: {args.pdf_path}") + result = page_index_main(args.pdf_path, opt) + base_name = os.path.splitext(os.path.basename(args.pdf_path))[0] + + # ----------------------------------------------------------------------- + # Markdown processing + # ----------------------------------------------------------------------- elif args.md_path: - # Validate Markdown file - if not args.md_path.lower().endswith(('.md', '.markdown')): - raise ValueError("Markdown file must have .md or .markdown extension") + if not args.md_path.lower().endswith((".md", ".markdown")): + parser.error("--md_path must point to a .md or .markdown file.") if not os.path.isfile(args.md_path): - raise ValueError(f"Markdown file not found: {args.md_path}") - - # Process markdown file - print('Processing markdown file...') - - # Process the markdown - import asyncio - - # Use ConfigLoader to get consistent defaults (matching PDF behavior) - from pageindex.utils import ConfigLoader - config_loader = ConfigLoader() - - # Create options dict with user args + parser.error(f"File not found: {args.md_path}") + user_opt = { - 'model': args.model, - 
'if_add_node_summary': args.if_add_node_summary, - 'if_add_doc_description': args.if_add_doc_description, - 'if_add_node_text': args.if_add_node_text, - 'if_add_node_id': args.if_add_node_id + "model": args.model, + "if_add_node_summary": args.if_add_node_summary, + "if_add_doc_description": args.if_add_doc_description, + "if_add_node_text": args.if_add_node_text, + "if_add_node_id": args.if_add_node_id, } - - # Load config with defaults from config.yaml - opt = config_loader.load(user_opt) - - toc_with_page_number = asyncio.run(md_to_tree( - md_path=args.md_path, - if_thinning=args.if_thinning.lower() == 'yes', - min_token_threshold=args.thinning_threshold, - if_add_node_summary=opt.if_add_node_summary, - summary_token_threshold=args.summary_token_threshold, - model=opt.model, - if_add_doc_description=opt.if_add_doc_description, - if_add_node_text=opt.if_add_node_text, - if_add_node_id=opt.if_add_node_id - )) - - print('Parsing done, saving to file...') - - # Save results - md_name = os.path.splitext(os.path.basename(args.md_path))[0] - output_dir = './results' - output_file = f'{output_dir}/{md_name}_structure.json' - os.makedirs(output_dir, exist_ok=True) - - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) - - print(f'Tree structure saved to: {output_file}') \ No newline at end of file + opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) + + print(f"Processing Markdown: {args.md_path}") + result = asyncio.run( + md_to_tree( + md_path=args.md_path, + if_thinning=args.if_thinning.lower() == "yes", + min_token_threshold=args.thinning_threshold, + if_add_node_summary=opt.if_add_node_summary, + summary_token_threshold=args.summary_token_threshold, + model=opt.model, + if_add_doc_description=opt.if_add_doc_description, + if_add_node_text=opt.if_add_node_text, + if_add_node_id=opt.if_add_node_id, + ) + ) + base_name = os.path.splitext(os.path.basename(args.md_path))[0] + 
+ # ----------------------------------------------------------------------- + # Save output + # ----------------------------------------------------------------------- + print("Parsing complete. Saving output...") + output_file = _save_result(result, base_name, args.output_dir, args.output_format) + print(f"Saved to: {output_file}")