From d42c3dffcc4fc623f6eebf1e5994207f2be07ca2 Mon Sep 17 00:00:00 2001 From: OnlyYu1996 <1158673577@qq.com> Date: Sun, 17 May 2026 06:02:13 +0800 Subject: [PATCH] fix(scrape): emit markdown table separators --- src/cortex-cli/src/scrape_cmd/html.rs | 91 ++++++++++++++++++++++++-- src/cortex-cli/src/scrape_cmd/tests.rs | 19 ++++++ 2 files changed, 104 insertions(+), 6 deletions(-) diff --git a/src/cortex-cli/src/scrape_cmd/html.rs b/src/cortex-cli/src/scrape_cmd/html.rs index 4e28f6768..d7ab3e06a 100644 --- a/src/cortex-cli/src/scrape_cmd/html.rs +++ b/src/cortex-cli/src/scrape_cmd/html.rs @@ -380,15 +380,11 @@ fn process_node_to_markdown( // Tables "table" => { output.push_str("\n\n"); - process_node_to_markdown( + output.push_str(&render_table_markdown( element_ref, - output, - list_depth, - in_pre, - in_code, no_images, no_links, - ); + )); output.push_str("\n\n"); } "thead" | "tbody" | "tfoot" => { @@ -448,6 +444,89 @@ fn process_node_to_markdown( } } +fn render_table_markdown(table: scraper::ElementRef, no_images: bool, no_links: bool) -> String { + let mut rows = Vec::new(); + collect_table_rows(table, &mut rows, no_images, no_links); + + if rows.is_empty() { + return String::new(); + } + + let mut output = String::new(); + for (row_index, row) in rows.iter().enumerate() { + output.push_str("| "); + output.push_str(&row.join(" | ")); + output.push_str(" |\n"); + + if row_index == 0 { + output.push_str("| "); + output.push_str(&vec!["---"; row.len().max(1)].join(" | ")); + output.push_str(" |\n"); + } + } + + output +} + +fn collect_table_rows( + node: scraper::ElementRef, + rows: &mut Vec>, + no_images: bool, + no_links: bool, +) { + for child in node.children() { + if let Some(element_ref) = scraper::ElementRef::wrap(child) { + match element_ref.value().name.local.as_ref() { + "tr" => rows.push(collect_table_cells(element_ref, no_images, no_links)), + "thead" | "tbody" | "tfoot" => { + collect_table_rows(element_ref, rows, no_images, no_links); + } + _ => {} + } + } + } +} + +fn collect_table_cells(row: scraper::ElementRef, no_images: bool, no_links: bool) -> Vec { + let mut cells = Vec::new(); + + for child in row.children() { + if let Some(element_ref) = scraper::ElementRef::wrap(child) + && matches!(element_ref.value().name.local.as_ref(), "th" | "td") + { + cells.push(render_table_cell(element_ref, no_images, no_links)); + } + } + + cells +} + +fn render_table_cell(cell: scraper::ElementRef, no_images: bool, no_links: bool) -> String { + let mut output = String::new(); + let mut list_depth = 0; + let mut in_pre = false; + let mut in_code = false; + + process_node_to_markdown( + cell, + &mut output, + &mut list_depth, + &mut in_pre, + &mut in_code, + no_images, + no_links, + ); + + clean_table_cell(&output) +} + +fn clean_table_cell(cell: &str) -> String { + normalize_whitespace(&cell.replace('\n', " ")) + .replace('|', "\\|") + .trim() + .to_string() +} + /// Convert HTML to plain text. pub fn html_to_text(html: &str) -> String { let cleaned = remove_unwanted_elements(html); diff --git a/src/cortex-cli/src/scrape_cmd/tests.rs b/src/cortex-cli/src/scrape_cmd/tests.rs index bd8b39509..453044c22 100644 --- a/src/cortex-cli/src/scrape_cmd/tests.rs +++ b/src/cortex-cli/src/scrape_cmd/tests.rs @@ -41,6 +41,25 @@ mod tests { assert!(!md_no_images.contains("![")); } + #[test] + fn test_html_to_markdown_table_includes_separator() { + let html = r#" + + + + + + + +
NameAge
Alice30
+ "#; + let md = html_to_markdown(html, false, false); + + assert!(md.contains("| Name | Age |")); + assert!(md.contains("| --- | --- |")); + assert!(md.contains("| Alice | 30 |")); + } + #[test] fn test_html_to_text() { let html = "

Title

Hello world!

";