Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 54 additions & 11 deletions app/parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include "parse.h"

#include <sstream>

void set_loglevel(std::string level)
{
if(level=="info")
Expand All @@ -26,9 +28,52 @@ void set_loglevel(std::string level)
}
}

/// Resolve which pages were requested on the command line.
///
/// Reads the `page` and `page-range` options from `result`:
///  - neither supplied (or `page` left at its -1 value): returns std::nullopt;
///  - `page` supplied: returns a one-element vector holding that page;
///  - `page-range` supplied as "start-end": returns every page in the
///    inclusive interval [start, end], in ascending order.
///
/// @param result parsed command-line options.
/// @return the explicit list of pages, or std::nullopt when no selection was made.
/// @throws std::runtime_error if both options are used together, if the range
///         is not of the form "start-end" with two non-negative decimal
///         integers that fit in an int, or if end < start.
std::optional<std::vector<int>> parse_page_selection(const cxxopts::ParseResult& result)
{
  const bool has_page = result.count("page") && result["page"].as<int>() != -1;
  const bool has_page_range = result.count("page-range") > 0;

  if(has_page && has_page_range)
    {
      throw std::runtime_error("Use either --page or --page-range, not both");
    }

  if(has_page)
    {
      return std::vector<int>{result["page"].as<int>()};
    }

  if(!has_page_range)
    {
      return std::nullopt;
    }

  const std::string raw = result["page-range"].as<std::string>();
  const size_t dash = raw.find('-');
  if(dash == std::string::npos)
    {
      throw std::runtime_error("Page range must have form start-end");
    }

  // Convert one side of the range. Only plain decimal digits are accepted, so
  // empty bounds ("3-", "-5"), trailing garbage ("3-5abc", "3-5-7"), signs and
  // whitespace are reported as a malformed range instead of being silently
  // truncated by std::stoi or escaping as a bare "stoi" exception.
  const auto parse_bound = [&raw](const std::string& text) -> int
  {
    if(text.empty() || text.find_first_not_of("0123456789") != std::string::npos)
      {
        throw std::runtime_error("Page range must have form start-end with non-negative integers, got '" + raw + "'");
      }

    try
      {
        return std::stoi(text);
      }
    catch(const std::out_of_range&)
      {
        throw std::runtime_error("Page range value is too large: '" + text + "'");
      }
  };

  const int start = parse_bound(raw.substr(0, dash));
  const int end = parse_bound(raw.substr(dash + 1));
  if(end < start)
    {
      throw std::runtime_error("Page range end must be >= start");
    }

  // Both bounds are non-negative and end >= start, so the count is computed in
  // size_t to avoid the signed overflow of `end - start + 1` on extreme ranges.
  const size_t count = static_cast<size_t>(end) - static_cast<size_t>(start) + 1;

  std::vector<int> pages;
  pages.reserve(count);

  // Test for the last page before incrementing so `page` never steps past
  // `end` (which would overflow when end is the largest int).
  for(int page = start; ; ++page)
    {
      pages.push_back(page);
      if(page == end)
        {
          break;
        }
    }
  return pages;
}

nlohmann::json create_config(std::filesystem::path ifile,
std::filesystem::path ofile,
int page=-1,
std::optional<std::vector<int>> pages=std::nullopt,
std::filesystem::path pdf_resource_dir="../docling_parse/pdf_resources/")
{
nlohmann::json config = nlohmann::json::object({});
Expand All @@ -46,10 +91,9 @@ nlohmann::json create_config(std::filesystem::path ifile,
task["output"] = ofile;
}

if(page!=-1)
if(pages.has_value())
{
std::vector<int> pages = {page};
task["page-numbers"] = pages;
task["page-numbers"] = *pages;
}

tasks.push_back(task);
Expand All @@ -76,6 +120,7 @@ int main(int argc, char* argv[]) {
("c,config", "Config file", cxxopts::value<std::string>())
("create-config", "Create config file", cxxopts::value<std::string>())
("p,page", "Pages to process (default: -1 for all)", cxxopts::value<int>()->default_value("-1"))
("page-range", "Inclusive page range to process, e.g. 10-20", cxxopts::value<std::string>())
("password", "Password for accessing encrypted, password-protected files", cxxopts::value<std::string>())
("o,output", "Output file", cxxopts::value<std::string>())
("export-images", "Export images to directory", cxxopts::value<std::string>())
Expand Down Expand Up @@ -181,15 +226,14 @@ int main(int argc, char* argv[]) {
std::string ifile = result["input"].as<std::string>();
std::string ofile = "";

int page = result["page"].as<int>();
LOG_F(INFO, "Page to process: %d", page);
auto pages = parse_page_selection(result);

if (result.count("output")) {
ofile = result["output"].as<std::string>();
LOG_F(INFO, "Output file: %s", ofile.c_str());
}

auto config = create_config(ifile, ofile, page);
auto config = create_config(ifile, ofile, pages);
LOG_S(INFO) << "config: \n" << config.dump(2);
}

Expand All @@ -199,8 +243,7 @@ int main(int argc, char* argv[]) {
std::string ifile = result["input"].as<std::string>();
std::string ofile = ifile+".json";

int page = result["page"].as<int>();
LOG_F(INFO, "Page to process: %d", page);
auto pages = parse_page_selection(result);

if (result.count("output")) {
ofile = result["output"].as<std::string>();
Expand All @@ -210,7 +253,7 @@ int main(int argc, char* argv[]) {
LOG_F(INFO, "No output file found, defaulting to %s", ofile.c_str());
}

auto config = create_config(ifile, ofile, page);
auto config = create_config(ifile, ofile, pages);
LOG_S(INFO) << "config: \n" << config.dump(2);
if (result.count("password")) {
config["password"] = result["password"].as<std::string>();
Expand Down Expand Up @@ -241,7 +284,7 @@ int main(int argc, char* argv[]) {

if (result.count("export-images")) {
std::string images_dir = result["export-images"].as<std::string>();
parser.export_images(images_dir, page);
parser.export_images(images_dir, result["page"].as<int>());
}

return 0;
Expand Down
1 change: 1 addition & 0 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
.def_readwrite("line_space_width_factor_for_merge", &pdflib::decode_config::line_space_width_factor_for_merge)
.def_readwrite("line_space_width_factor_for_merge_with_space", &pdflib::decode_config::line_space_width_factor_for_merge_with_space)
.def_readwrite("do_thread_safe", &pdflib::decode_config::do_thread_safe)
.def_readwrite("release_native_memory_every_n_pages", &pdflib::decode_config::release_native_memory_every_n_pages)
.def_readwrite("keep_glyphs", &pdflib::decode_config::keep_glyphs)
.def_readwrite("keep_qpdf_warnings", &pdflib::decode_config::keep_qpdf_warnings);

Expand Down
3 changes: 3 additions & 0 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,9 @@ def _copy_decode_config(src: DecodePageConfig) -> DecodePageConfig:
src.line_space_width_factor_for_merge_with_space
)
dst.do_thread_safe = src.do_thread_safe
dst.release_native_memory_every_n_pages = (
src.release_native_memory_every_n_pages
)
dst.keep_glyphs = src.keep_glyphs
dst.keep_qpdf_warnings = src.keep_qpdf_warnings
return dst
Expand Down
14 changes: 3 additions & 11 deletions perf/run_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

What this script does:
1) Reads a CSV and finds the top N slowest successful pages.
2) Loads those documents with docling-parse (typed or json pipeline selection).
2) Loads those documents with docling-parse via the typed pipeline.
3) Retrieves detailed stage timings from the underlying parser.
4) Outputs results based on mode:
--top: CSV with static timings per pdf-page
--nth: Table with all timings (static + dynamic) showing sum, avg, std, count

Usage examples:
python perf/run_analysis.py perf/results/perf_docling_*.csv --top 25 --mode typed --loglevel fatal
python perf/run_analysis.py perf/results/perf_docling_20250915-151237.csv --mode json --nth 7
python perf/run_analysis.py perf/results/perf_docling_*.csv --top 25 --loglevel fatal
python perf/run_analysis.py perf/results/perf_docling_20250915-151237.csv --nth 7
"""

from __future__ import annotations
Expand Down Expand Up @@ -130,7 +130,6 @@ def extract_timings_for_page(
def analyze_pages(
csv_path: Path,
top_n: int | None,
mode: str,
min_sec: float | None = None,
*,
nth: int | None = None,
Expand Down Expand Up @@ -327,12 +326,6 @@ def main(argv: List[str]) -> int:
default=None,
help="Optional minimum elapsed_sec threshold",
)
ap.add_argument(
"--mode",
choices=["typed", "json"],
default="typed",
help="Pipeline to trigger before fetching timings",
)
ap.add_argument(
"--loglevel",
choices=["fatal", "error", "warning", "info"],
Expand Down Expand Up @@ -366,7 +359,6 @@ def main(argv: List[str]) -> int:
pages = analyze_pages(
csv_path,
top_n=args.top,
mode=args.mode,
min_sec=args.min_sec,
nth=args.nth,
loglevel=args.loglevel,
Expand Down
1 change: 1 addition & 0 deletions perf/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class PageRow:

KNOWN_PARSERS = [
# Known keys from perf/run_perf.py
"docling-threaded",
"docling",
"pdfplumber",
"pypdfium2",
Expand Down
Loading
Loading