Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 85 additions & 6 deletions src/cortex-cli/src/scrape_cmd/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,15 +380,11 @@ fn process_node_to_markdown(
// Tables
"table" => {
output.push_str("\n\n");
process_node_to_markdown(
output.push_str(&render_table_markdown(
element_ref,
output,
list_depth,
in_pre,
in_code,
no_images,
no_links,
);
));
output.push_str("\n\n");
}
"thead" | "tbody" | "tfoot" => {
Expand Down Expand Up @@ -448,6 +444,89 @@ fn process_node_to_markdown(
}
}

fn render_table_markdown(table: scraper::ElementRef, no_images: bool, no_links: bool) -> String {
let mut rows = Vec::new();
collect_table_rows(table, &mut rows, no_images, no_links);

if rows.is_empty() {
return String::new();
}

let mut output = String::new();
for (row_index, row) in rows.iter().enumerate() {
output.push_str("| ");
output.push_str(&row.join(" | "));
output.push_str(" |\n");

if row_index == 0 {
output.push_str("| ");
output.push_str(&vec!["---"; row.len().max(1)].join(" | "));
output.push_str(" |\n");
}
}

output
}

/// Appends one entry to `rows` for every `<tr>` found under `node`.
///
/// Only direct element children of `node` are inspected. A `<tr>` child is converted to its
/// list of rendered cells via `collect_table_cells`; a `<thead>`, `<tbody>` or `<tfoot>` child
/// is descended into recursively. Any other child element is ignored. Rows are appended in
/// document order.
fn collect_table_rows(
    node: scraper::ElementRef,
    rows: &mut Vec<Vec<String>>,
    no_images: bool,
    no_links: bool,
) {
    // Non-element children (text, comments, ...) are dropped by `ElementRef::wrap`.
    let child_elements = node.children().filter_map(scraper::ElementRef::wrap);

    for section_or_row in child_elements {
        match section_or_row.value().name.local.as_ref() {
            "thead" | "tbody" | "tfoot" => {
                collect_table_rows(section_or_row, rows, no_images, no_links);
            }
            "tr" => {
                let cells = collect_table_cells(section_or_row, no_images, no_links);
                rows.push(cells);
            }
            _ => {}
        }
    }
}

/// Renders each `<th>` / `<td>` that is a direct child of `row` into a cell string.
///
/// Children that are not elements, or that are elements with any other tag name, are skipped.
/// The returned vector preserves document order and is empty when the row has no cells.
fn collect_table_cells(row: scraper::ElementRef, no_images: bool, no_links: bool) -> Vec<String> {
    row.children()
        .filter_map(scraper::ElementRef::wrap)
        .filter(|candidate| matches!(candidate.value().name.local.as_ref(), "th" | "td"))
        .map(|cell| render_table_cell(cell, no_images, no_links))
        .collect()
}

/// Converts a single table cell element into one line of table-safe Markdown.
///
/// The cell is run through `process_node_to_markdown` with its own fresh traversal state
/// (list depth `0`, not inside `<pre>`, not inside `<code>`) and its own output buffer, so
/// nothing leaks in from or out to the surrounding document conversion. The raw result is then
/// passed through `clean_table_cell`.
fn render_table_cell(cell: scraper::ElementRef, no_images: bool, no_links: bool) -> String {
    // Independent state for this cell only.
    let (mut depth, mut inside_pre, mut inside_code) = (0, false, false);
    let mut raw_markdown = String::new();

    process_node_to_markdown(
        cell,
        &mut raw_markdown,
        &mut depth,
        &mut inside_pre,
        &mut inside_code,
        no_images,
        no_links,
    );

    clean_table_cell(&raw_markdown)
}

/// Makes rendered cell content safe to embed in a single Markdown table line.
///
/// Steps, in order:
/// 1. every `\n` becomes a space, since a table row cannot span lines;
/// 2. the text is passed through `normalize_whitespace` (defined elsewhere in this file;
///    presumably collapses whitespace runs — confirm against its definition);
/// 3. every `|` is escaped as `\|` so it is not read as a column separator;
/// 4. leading and trailing whitespace is trimmed.
fn clean_table_cell(cell: &str) -> String {
    let single_line = cell.replace('\n', " ");
    let normalized = normalize_whitespace(&single_line);
    let escaped = normalized.replace('|', "\\|");

    escaped.trim().to_string()
}

/// Convert HTML to plain text.
pub fn html_to_text(html: &str) -> String {
let cleaned = remove_unwanted_elements(html);
Expand Down
19 changes: 19 additions & 0 deletions src/cortex-cli/src/scrape_cmd/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,25 @@ mod tests {
assert!(!md_no_images.contains("!["));
}

/// A table with a `<thead>` header row and one `<tbody>` row must convert to a Markdown table
/// containing the header line, the `---` delimiter line, and the body line.
#[test]
fn test_html_to_markdown_table_includes_separator() {
    let source = r#"
<table>
<thead>
<tr><th>Name</th><th>Age</th></tr>
</thead>
<tbody>
<tr><td>Alice</td><td>30</td></tr>
</tbody>
</table>
"#;

    // Images and links are both left enabled.
    let markdown = html_to_markdown(source, false, false);

    let header_line = "| Name | Age |";
    let delimiter_line = "| --- | --- |";
    let body_line = "| Alice | 30 |";

    assert!(markdown.contains(header_line));
    assert!(markdown.contains(delimiter_line));
    assert!(markdown.contains(body_line));
}

#[test]
fn test_html_to_text() {
let html = "<h1>Title</h1><p>Hello <strong>world</strong>!</p>";
Expand Down