Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 153 additions & 65 deletions kolibrie/src/sparql_database.rs
Original file line number Diff line number Diff line change
Expand Up @@ -727,24 +727,24 @@ impl SparqlDatabase {
}
}

// New parse_turtle function
pub fn parse_turtle(&mut self, turtle_data: &str) {
let lines = turtle_data.lines();

for line in lines {
let line = line.trim();
pub fn parse_turtle(&mut self, turtle_data: &str) {
for raw_line in turtle_data.lines() {
let line = raw_line.trim();

// Skip empty lines and comments
if line.is_empty() || line.starts_with("#") {
continue;
}

// Prefix declarations
if line.starts_with("@prefix") || line.starts_with("PREFIX") {
let prefix_line = line
.trim_start_matches("@prefix")
.trim_start_matches("PREFIX")
.trim_end_matches('.')
.trim();

let parts: Vec<&str> = prefix_line.split_whitespace().collect();
if parts.len() >= 2 {
let prefix = parts[0].trim_end_matches(':').to_string();
Expand All @@ -756,83 +756,153 @@ impl SparqlDatabase {
continue;
}

// Use quoted-triple-aware tokenization
let line_no_dot = line.trim_end_matches('.').trim();
let tokens = Self::tokenize_turtle_star_line(line_no_dot);

if tokens.len() >= 3 {
let subject_raw = &tokens[0];
let predicate_raw = &tokens[1];
let object_raw = tokens[2..].join(" ");

// Check for annotation syntax {| ... |}
let (object_part, annotations) = if let Some(ann_start) = object_raw.find("{|") {
let obj = object_raw[..ann_start].trim().to_string();
// Extract annotation content between {| and |}
if let Some(ann_end) = object_raw.find("|}") {
let ann_content = object_raw[ann_start + 2..ann_end].trim();
let ann_parts: Vec<&str> = ann_content.splitn(2, char::is_whitespace).collect();
if ann_parts.len() == 2 {
(obj, vec![(ann_parts[0].to_string(), ann_parts[1].to_string())])
// Tokenize, but keep ; , . as delimiters only when outside URIs, literals, and quoted triples.
let tokens = Self::tokenize_turtle_star_line(line);

let mut subject_raw: Option<String> = None;
let mut predicate_raw: Option<String> = None;
let mut object_tokens: Vec<String> = Vec::new();

let mut expect_subject = true;
let mut expect_predicate = false;
let mut expect_object = false;

let flush_object = |this: &mut Self,
subject_raw: &Option<String>,
predicate_raw: &Option<String>,
object_tokens: &mut Vec<String>| {
if let (Some(s_raw), Some(p_raw)) = (subject_raw.as_ref(), predicate_raw.as_ref()) {
if object_tokens.is_empty() {
return;
}

let object_raw = object_tokens.join(" ");

// Handle annotation syntax {| ... |}
let (object_part, annotations) = if let Some(ann_start) = object_raw.find("{|") {
let obj = object_raw[..ann_start].trim().to_string();

if let Some(ann_end) = object_raw.find("|}") {
let ann_content = object_raw[ann_start + 2..ann_end].trim();
let ann_parts: Vec<&str> =
ann_content.splitn(2, char::is_whitespace).collect();

if ann_parts.len() == 2 {
(obj, vec![(ann_parts[0].to_string(), ann_parts[1].to_string())])
} else {
(obj, vec![])
}
} else {
(obj, vec![])
(object_raw, vec![])
}
} else {
(object_raw, vec![])
};

let subject = this.resolve_query_term(&Self::clean_turtle_term(s_raw), &this.prefixes);
let predicate = this.resolve_query_term(&Self::clean_turtle_term(p_raw), &this.prefixes);
let object = this.resolve_query_term(&Self::clean_turtle_term(&object_part), &this.prefixes);

// Emit the main triple
if subject.starts_with("<<") || object.starts_with("<<") {
let s_id = this.encode_term_star(&subject);
let p_id = this.encode_term_star(&predicate);
let o_id = this.encode_term_star(&object);
let triple = Triple {
subject: s_id,
predicate: p_id,
object: o_id,
};
this.add_triple(triple);
} else {
let mut dict = this.dictionary.write().unwrap();
let triple = Triple {
subject: dict.encode(&subject),
predicate: dict.encode(&predicate),
object: dict.encode(&object),
};
drop(dict);
this.add_triple(triple);
}
} else {
(object_raw, vec![])
};

let subject = self.resolve_query_term(&Self::clean_turtle_term(subject_raw), &self.prefixes);
let predicate = self.resolve_query_term(&Self::clean_turtle_term(predicate_raw), &self.prefixes);
let object = self.resolve_query_term(&Self::clean_turtle_term(&object_part), &self.prefixes);

// Check if subject or object is a quoted triple
if subject.starts_with("<<") || object.starts_with("<<") {
let s_id = self.encode_term_star(&subject);
let p_id = self.encode_term_star(&predicate);
let o_id = self.encode_term_star(&object);
let triple = Triple { subject: s_id, predicate: p_id, object: o_id };
self.add_triple(triple);
} else {
let mut dict = self.dictionary.write().unwrap();
let triple = Triple {
subject: dict.encode(&subject),
predicate: dict.encode(&predicate),
object: dict.encode(&object),
};
drop(dict);
self.add_triple(triple);
// Emit annotation triples, if any
for (ann_pred, ann_obj) in &annotations {
let qt_str = format!("<< {} {} {} >>", subject, predicate, object);
let qt_id = this.encode_term_star(&qt_str);

let ann_p_id = this.encode_term_star(
&this.resolve_query_term(&Self::clean_turtle_term(ann_pred), &this.prefixes),
);
let ann_o_id = this.encode_term_star(
&this.resolve_query_term(&Self::clean_turtle_term(ann_obj), &this.prefixes),
);

let ann_triple = Triple {
subject: qt_id,
predicate: ann_p_id,
object: ann_o_id,
};
this.add_triple(ann_triple);
}

object_tokens.clear();
}
};

// Handle annotations: emit additional triples with << s p o >> as subject
for (ann_pred, ann_obj) in &annotations {
let qt_str = format!("<< {} {} {} >>", subject, predicate, object);
let qt_id = self.encode_term_star(&qt_str);
let ann_p_id = self.encode_term_star(
&self.resolve_query_term(&Self::clean_turtle_term(ann_pred), &self.prefixes),
);
let ann_o_id = self.encode_term_star(
&self.resolve_query_term(&Self::clean_turtle_term(ann_obj), &self.prefixes),
);
let ann_triple = Triple { subject: qt_id, predicate: ann_p_id, object: ann_o_id };
self.add_triple(ann_triple);
for token in tokens {
match token.as_str() {
"." => {
flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens);
subject_raw = None;
predicate_raw = None;
expect_subject = true;
expect_predicate = false;
expect_object = false;
}
";" => {
flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens);
predicate_raw = None;
expect_predicate = true;
expect_object = false;
}
"," => {
flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens);
expect_object = true;
}
_ => {
if expect_subject {
subject_raw = Some(token);
expect_subject = false;
expect_predicate = true;
} else if expect_predicate {
predicate_raw = Some(token);
expect_predicate = false;
expect_object = true;
} else if expect_object {
object_tokens.push(token);
} else {
// Fallback for slightly malformed input
object_tokens.push(token);
}
}
}
} else {
eprintln!("Skipping invalid line: {}", line);
}

// Flush any trailing object if the line does not end with '.'
flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens);
}
}

/// Tokenize a Turtle-star line, keeping `<< ... >>` as a single token.
/// Tokenize a Turtle-star line, keeping `<< ... >>` and punctuation structure intact.
fn tokenize_turtle_star_line(line: &str) -> Vec<String> {
let mut tokens = Vec::new();
let mut current = String::new();
let mut depth = 0i32;
let mut in_uri = false;
let mut in_literal = false;

let mut depth = 0i32; // quoted-triple nesting depth
let mut in_uri = false; // inside <...>
let mut in_literal = false; // inside "..."
let mut escaped = false;

let mut chars = line.chars().peekable();

while let Some(ch) = chars.next() {
Expand All @@ -841,19 +911,23 @@ impl SparqlDatabase {
escaped = false;
continue;
}

match ch {
'\\' if in_literal => {
current.push(ch);
escaped = true;
}

'"' if !in_uri && depth == 0 => {
in_literal = !in_literal;
current.push(ch);
}

'"' if depth > 0 => {
in_literal = !in_literal;
current.push(ch);
}

'<' if !in_literal => {
if chars.peek() == Some(&'<') && !in_uri {
current.push(ch);
Expand All @@ -870,6 +944,7 @@ impl SparqlDatabase {
current.push(ch);
}
}

'>' if !in_literal => {
if depth > 0 && !in_uri {
current.push(ch);
Expand All @@ -892,22 +967,35 @@ impl SparqlDatabase {
current.push(ch);
}
}

';' | ',' | '.' if depth == 0 && !in_uri && !in_literal => {
let trimmed = current.trim().to_string();
if !trimmed.is_empty() {
tokens.push(trimmed);
current.clear();
}
tokens.push(ch.to_string());
}

' ' | '\t' | '\n' | '\r' if depth == 0 && !in_uri && !in_literal => {
let trimmed = current.trim().to_string();
if !trimmed.is_empty() {
tokens.push(trimmed);
current.clear();
}
}

_ => {
current.push(ch);
}
}
}

let trimmed = current.trim().to_string();
if !trimmed.is_empty() {
tokens.push(trimmed);
}

tokens
}

Expand Down