Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion app/src/notebooks/editor/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::{
sync::atomic::{AtomicBool, Ordering},
};

use markdown_parser::{parse_html, parse_markdown, FormattedText};
use markdown_parser::{parse_html, parse_markdown, parse_markdown_with_gfm_tables, FormattedText};
use pathfinder_geometry::vector::vec2f;
use string_offset::CharOffset;
use warp_editor::{
Expand Down Expand Up @@ -1965,6 +1965,11 @@ impl RichTextEditorView {

fn paste_content(&mut self, content: ClipboardContent, ctx: &mut ViewContext<Self>) {
let parsed_html = content.html.and_then(|text| parse_html(text.as_str()).ok());
let parse_markdown = if FeatureFlag::MarkdownTables.is_enabled() {
parse_markdown_with_gfm_tables
} else {
parse_markdown
};

// If we failed to get the html string, try parsing plain text into markdown first.
// If that failed as well, fall back to pasting plain text string.
Expand Down
106 changes: 103 additions & 3 deletions crates/markdown_parser/src/html_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@ use html5ever::{
use markup5ever_rcdom::{Node, NodeData, RcDom};

use crate::{
CodeBlockText, FormattedIndentTextInline, FormattedTaskList, FormattedText,
CodeBlockText, FormattedIndentTextInline, FormattedTable, FormattedTaskList, FormattedText,
FormattedTextFragment, FormattedTextHeader, FormattedTextInline, FormattedTextLine,
FormattedTextStyles, Hyperlink, OrderedFormattedIndentTextInline,
FormattedTextStyles, Hyperlink, OrderedFormattedIndentTextInline, TableAlignment,
markdown_parser::RUNNABLE_BLOCK_MARKDOWN_LANG, weight::CustomWeight,
};

// Top-level element tags we are not parsing right now.
// Note that we have "<b>" here because GDocs always includes a top-level <b> element to add additional
// GDocs-specific metadata for its rich text content.
const TOP_LEVEL_ELEMENT_TAGS_TO_SKIP: &[&str] = &[
"head", "body", "html", "meta", "table", "b", "div", "ul", "ol", "li", "input",
"head", "body", "html", "meta", "b", "div", "ul", "ol", "li", "input",
];
const PHRASING_ELEMENT_TAGS: &[&str] = &[
"span", "i", "code", "strong", "em", "br", "a", "s", "u", "ins",
Expand Down Expand Up @@ -335,6 +335,7 @@ pub fn parse_html(html: &str) -> Result<FormattedText> {
}),
"br" => FormattedTextLine::LineBreak,
"hr" => FormattedTextLine::HorizontalRule,
"table" => FormattedTextLine::Table(parse_table(node.as_ref())),
_ => {
// Take into consideration the indent level when parsing the nodes.
let parsed_node = parse_pending_inline_nodes(
Expand Down Expand Up @@ -365,6 +366,105 @@ pub fn parse_html(html: &str) -> Result<FormattedText> {

Ok(FormattedText { lines: result })
}
fn parse_table(table: &Node) -> FormattedTable {
let mut row_nodes = Vec::new();
collect_table_rows(table, &mut row_nodes);

let rows = row_nodes
.iter()
.filter_map(|row| {
let cells = collect_table_cells(row)
.into_iter()
.map(|cell| parse_phrasing_content(&cell.children.borrow(), Styling::default()))
.collect::<Vec<_>>();

(!cells.is_empty()).then_some(cells)
})
.collect::<Vec<_>>();

let (headers, rows) = match rows.split_first() {
Some((headers, rows)) => (headers.clone(), rows.to_vec()),
None => (Vec::new(), Vec::new()),
};

let mut alignments = row_nodes
.first()
.map(|row| {
collect_table_cells(row)
.into_iter()
.map(|cell| parse_table_cell_alignment(&cell))
.collect::<Vec<_>>()
})
.unwrap_or_default();

let mut table = FormattedTable {
headers,
alignments: std::mem::take(&mut alignments),
rows,
};
table.normalize_shape();
table
}

fn collect_table_rows(node: &Node, rows: &mut Vec<Rc<Node>>) {
for child in node.children.borrow().iter() {
if element_name(child) == Some("tr") {
rows.push(Rc::clone(child));
continue;
}

collect_table_rows(child.as_ref(), rows);
}
}

/// Returns the direct `<td>` and `<th>` children of `row`, in document order.
///
/// Other children (text, comments, other elements) are left out.
fn collect_table_cells(row: &Node) -> Vec<Rc<Node>> {
    let mut cells = Vec::new();
    for child in row.children.borrow().iter() {
        if let Some("td" | "th") = element_name(child) {
            cells.push(Rc::clone(child));
        }
    }
    cells
}

/// Returns the local tag name of `node` when it is an element, and `None` for every other kind of
/// node.
fn element_name(node: &Rc<Node>) -> Option<&str> {
    // The non-element variants are listed explicitly so the match stays exhaustive.
    let qualified_name = match &node.data {
        NodeData::Element { name, .. } => name,
        NodeData::Document
        | NodeData::Doctype { .. }
        | NodeData::Text { .. }
        | NodeData::Comment { .. }
        | NodeData::ProcessingInstruction { .. } => return None,
    };

    Some(qualified_name.local.as_ref())
}

/// Determines the alignment requested by a table cell.
///
/// The `align` attribute is consulted first. If it is missing or holds an unrecognized value, the
/// `text-align` entry of the `style` attribute is tried next. Non-element nodes and cells without
/// a recognized alignment resolve to [`TableAlignment::Left`].
fn parse_table_cell_alignment(cell: &Node) -> TableAlignment {
    let attrs = match &cell.data {
        NodeData::Element { attrs, .. } => attrs.borrow(),
        _ => return TableAlignment::Left,
    };

    let from_align_attr = get_attribute(&attrs, "align").and_then(alignment_from_html_value);

    // Evaluated lazily: the inline style is only parsed when `align` gave no usable answer.
    let from_inline_style = || {
        get_attribute(&attrs, "style")
            .map(parse_style_into_dict)
            .and_then(|style| {
                style
                    .get("text-align")
                    .and_then(|alignment| alignment_from_html_value(alignment))
            })
    };

    from_align_attr
        .or_else(from_inline_style)
        .unwrap_or(TableAlignment::Left)
}

/// Maps an HTML/CSS alignment keyword to a [`TableAlignment`].
///
/// Surrounding whitespace is ignored and the comparison is ASCII case-insensitive. `left`/`start`
/// map to `Left`, `center` to `Center`, and `right`/`end` to `Right`; any other value yields
/// `None`.
fn alignment_from_html_value(value: &str) -> Option<TableAlignment> {
    let keyword = value.trim();
    let is = |candidate: &str| keyword.eq_ignore_ascii_case(candidate);

    if is("left") || is("start") {
        Some(TableAlignment::Left)
    } else if is("center") {
        Some(TableAlignment::Center)
    } else if is("right") || is("end") {
        Some(TableAlignment::Right)
    } else {
        None
    }
}

// Push all pending inline nodes into the result. Take into consideration the active indent level.
fn parse_pending_inline_nodes(
Expand Down
27 changes: 15 additions & 12 deletions crates/markdown_parser/src/html_parser_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ fn test_transform_non_breaking_spaces() {
assert_eq!(test_parse_html(safari_html), expected_text);
}

// TODO: remove/update this test when we eventually support these HTML element types!
#[test]
fn test_unsupported_html_types() {
assert_eq!(
Expand All @@ -203,21 +202,25 @@ fn test_unsupported_html_types() {
])
]
);
}

#[test]
fn test_parse_table() {
assert_eq!(
test_parse_html(
"<meta charset='utf-8'><table><thead><tr><th>Text 1</th><th>Text 2</th></tr></thead><tbody><tr><td>Test</td><td>Test</td></tr></tbody></table>"
r#"<meta charset='utf-8'><table><thead><tr><th align="left">Text 1</th><th style="text-align: right"><strong>Text 2</strong></th></tr></thead><tbody><tr><td>Test</td><td><em>Test</em></td></tr></tbody></table>"#
),
vec![
FormattedTextLine::Line(vec![
FormattedTextFragment::plain_text("Text 1"),
FormattedTextFragment::plain_text("Text 2")
]),
FormattedTextLine::Line(vec![
FormattedTextFragment::plain_text("Test"),
FormattedTextFragment::plain_text("Test")
])
]
vec![FormattedTextLine::Table(FormattedTable {
headers: vec![
vec![FormattedTextFragment::plain_text("Text 1")],
vec![FormattedTextFragment::bold("Text 2")]
],
alignments: vec![TableAlignment::Left, TableAlignment::Right],
rows: vec![vec![
vec![FormattedTextFragment::plain_text("Test")],
vec![FormattedTextFragment::italic("Test")]
]]
})]
);
}

Expand Down
Loading