From 8edf3a21c524ae97ff02e25f02b8a72b139aee14 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Sun, 15 Feb 2026 21:18:04 +0100 Subject: [PATCH 01/23] Fix test compile error --- kolibrie/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kolibrie/Cargo.toml b/kolibrie/Cargo.toml index b9d83d8..0b7a49e 100644 --- a/kolibrie/Cargo.toml +++ b/kolibrie/Cargo.toml @@ -222,7 +222,7 @@ path = "examples/sparql_syntax/concat/concat.rs" [[example]] name = "volcano" -path = "examples/sparql_syntax/streamertail_optimizer/volcano.rs" +path = "examples/sparql_syntax/volcano_optimizer/volcano.rs" [[example]] name = "select_semicolon" From 0768218deb1671f63623aafeb235875f12bc4a52 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Mon, 16 Feb 2026 01:35:15 +0100 Subject: [PATCH 02/23] Implement generic index functionality --- datalog/src/reasoning.rs | 6 +- datalog/src/reasoning_experimental.rs | 1 + .../knowledge_graph/contradictions.rs | 1 + .../knowledge_graph/deep_taxonomy.rs | 1 + .../knowledge_graph/knowledge_graph.rs | 1 + kolibrie/src/disk_storage/lsm_tree.rs | 17 +- kolibrie/src/disk_storage/sstable.rs | 8 +- kolibrie/src/parser.rs | 1 + kolibrie/src/query_engine.rs | 3 +- kolibrie/src/sparql_database.rs | 12 +- kolibrie/src/storage_manager.rs | 1 + .../hexastore.rs} | 1065 ++++++++--------- shared/src/index_manager/mod.rs | 100 ++ shared/src/index_manager/pso_single.rs | 314 +++++ shared/src/join_algorithm.rs | 48 +- 15 files changed, 979 insertions(+), 600 deletions(-) rename shared/src/{index_manager.rs => index_manager/hexastore.rs} (86%) create mode 100644 shared/src/index_manager/mod.rs create mode 100644 shared/src/index_manager/pso_single.rs diff --git a/datalog/src/reasoning.rs b/datalog/src/reasoning.rs index 28638db..73fc34a 100644 --- a/datalog/src/reasoning.rs +++ b/datalog/src/reasoning.rs @@ -27,7 +27,7 @@ pub struct Reasoner { pub dictionary: Dictionary, pub 
rules: Vec, // List of dynamic rules - pub index_manager: UnifiedIndex, + pub index_manager: Box, pub rule_index: RuleIndex, pub constraints: Vec, } @@ -37,7 +37,7 @@ impl Reasoner { Self { dictionary: Dictionary::new(), rules: Vec::new(), - index_manager: UnifiedIndex::new(), + index_manager: Box::new(HexastoreIndex::new()), rule_index: RuleIndex::new(), constraints: Vec::new(), } @@ -669,7 +669,7 @@ impl Reasoner { let repairs = self.compute_repairs(&all_facts); if let Some(best_repair) = repairs.into_iter().max_by_key(|r| r.len()) { // Clear index manager and reinsert repaired facts - self.index_manager = UnifiedIndex::new(); + self.index_manager.clear(); for fact in &best_repair { self.index_manager.insert(fact); } diff --git a/datalog/src/reasoning_experimental.rs b/datalog/src/reasoning_experimental.rs index f19298c..0ba86e2 100644 --- a/datalog/src/reasoning_experimental.rs +++ b/datalog/src/reasoning_experimental.rs @@ -10,6 +10,7 @@ use shared::rule::Rule; use shared::triple::Triple; +use shared::index_manager::TripleIndex; use crate::reasoning::Reasoner; use std::collections::{BTreeMap, HashMap, HashSet}; use shared::terms::Term; diff --git a/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs b/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs index fe7069e..5bf64ff 100644 --- a/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs +++ b/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs @@ -8,6 +8,7 @@ * you can obtain one at https://mozilla.org/MPL/2.0/. 
*/ +use shared::index_manager::TripleIndex; use shared::terms::Term; use shared::rule::Rule; use datalog::reasoning::Reasoner; diff --git a/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs b/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs index b0c805d..5acee5a 100644 --- a/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs +++ b/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs @@ -1,6 +1,7 @@ use datalog::reasoning::Reasoner; use shared::terms::Term; use shared::rule::Rule; +use shared::index_manager::TripleIndex; use kolibrie::sparql_database::SparqlDatabase; use std::fs; use std::time::Instant; diff --git a/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs b/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs index c45314c..811ac2d 100644 --- a/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs +++ b/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs @@ -11,6 +11,7 @@ use shared::dictionary::Dictionary; use shared::terms::Term; use shared::rule::Rule; +use shared::index_manager::TripleIndex; use datalog::reasoning::*; use datalog::parser_n3_logic::parse_n3_rule; diff --git a/kolibrie/src/disk_storage/lsm_tree.rs b/kolibrie/src/disk_storage/lsm_tree.rs index 75b60d7..83f7240 100644 --- a/kolibrie/src/disk_storage/lsm_tree.rs +++ b/kolibrie/src/disk_storage/lsm_tree.rs @@ -9,7 +9,7 @@ */ use shared::triple::Triple; -use shared::index_manager::UnifiedIndex; +use shared::index_manager::*; use std::collections::VecDeque; use std::sync::{Arc, RwLock, Mutex}; use std::path::PathBuf; @@ -480,16 +480,21 @@ impl LSMTree { } /// Build UnifiedIndex from all data in LSM-Tree - pub fn build_unified_index(&self) -> UnifiedIndex { - let mut index = UnifiedIndex::new(); + pub fn build_unified_index(&self) -> HexastoreIndex { + let mut index = HexastoreIndex::new(); let all_triples = self.get_all_triples(); index.build_from_triples(&all_triples); index } 
- /// Export to UnifiedIndex for use in SparqlDatabase - pub fn export_to_unified_index(&self) -> UnifiedIndex { - self.build_unified_index() + /// Export as Box for use in SparqlDatabase + pub fn export_to_trait_index(&self) -> Box { + Box::new(self.build_unified_index()) + } + + /// Keep old name for backward compat, now returns boxed + pub fn export_to_unified_index(&self) -> Box { + self.export_to_trait_index() } } diff --git a/kolibrie/src/disk_storage/sstable.rs b/kolibrie/src/disk_storage/sstable.rs index cab3138..f5cedef 100644 --- a/kolibrie/src/disk_storage/sstable.rs +++ b/kolibrie/src/disk_storage/sstable.rs @@ -9,7 +9,7 @@ */ use shared::triple::Triple; -use shared::index_manager::UnifiedIndex; +use shared::index_manager::*; use std::path::{Path, PathBuf}; use std::fs::File; use serde::{Serialize, Deserialize}; @@ -23,7 +23,7 @@ pub struct SSTable { /// Level in LSM tree (0, 1, 2, ...) pub level: usize, /// UnifiedIndex containing all 6 permutations - pub index: UnifiedIndex, + pub index: HexastoreIndex, /// Min and max keys for range queries (optimization) pub min_key: Triple, pub max_key: Triple, @@ -43,7 +43,7 @@ impl SSTable { memtable: &MemTable, data_dir: &Path, ) -> Result { - let mut index = UnifiedIndex::new(); + let mut index = HexastoreIndex::new(); let mut triples: Vec = Vec::new(); // Only include non-deleted triples @@ -90,7 +90,7 @@ impl SSTable { sstables: Vec<&SSTable>, data_dir: &Path, ) -> Result { - let mut merged_index = UnifiedIndex::new(); + let mut merged_index = HexastoreIndex::new(); // Merge all indexes for sstable in &sstables { diff --git a/kolibrie/src/parser.rs b/kolibrie/src/parser.rs index 2bcb3c1..4cc90e0 100644 --- a/kolibrie/src/parser.rs +++ b/kolibrie/src/parser.rs @@ -27,6 +27,7 @@ use shared::rule::FilterCondition; use shared::rule::Rule; use shared::terms::*; use shared::query::*; +use shared::index_manager::TripleIndex; // Add RSP imports use crate::rsp::s2r::{CSPARQLWindow, Report, ReportStrategy, Tick, 
WindowTriple, ContentContainer}; use crate::rsp::r2s::{Relation2StreamOperator, StreamOperator}; diff --git a/kolibrie/src/query_engine.rs b/kolibrie/src/query_engine.rs index 2a80c2c..4fe9865 100644 --- a/kolibrie/src/query_engine.rs +++ b/kolibrie/src/query_engine.rs @@ -8,6 +8,7 @@ * you can obtain one at https://mozilla.org/MPL/2.0/. */ +use shared::index_manager::TripleIndex; use crate::storage_manager::{StorageManager, StorageBackend, StorageStats}; use crate::storage_trait::{StorageTrait, StorageMode, QueryAnalyzer}; use crate::disk_storage::lsm_tree::LSMConfig; @@ -121,7 +122,7 @@ impl QueryEngine { // Clear memory database self.storage_manager.get_memory_database_mut().triples.clear(); self.storage_manager.get_memory_database_mut().index_manager = - shared::index_manager::UnifiedIndex::new(); + Box::new(shared::index_manager::HexastoreIndex::new()); // Build statistics self.storage_manager.get_memory_database_mut().get_or_build_stats(); diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index fd6b454..f35ccbf 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -19,7 +19,7 @@ use crate::utils::current_timestamp; use crate::utils::ClonableFn; #[cfg(feature = "cuda")] use crate::cuda::cuda_join::*; -use shared::index_manager::UnifiedIndex; +use shared::index_manager::*; use crate::query_builder::QueryBuilder; use crossbeam::channel::unbounded; use crossbeam::scope; @@ -52,7 +52,7 @@ pub struct SparqlDatabase { pub dictionary: Dictionary, pub prefixes: HashMap, pub udfs: HashMap, - pub index_manager: UnifiedIndex, + pub index_manager: Box, pub rule_map: HashMap, pub cached_stats: Option>, } @@ -67,7 +67,7 @@ impl SparqlDatabase { dictionary: Dictionary::new(), prefixes: HashMap::new(), udfs: HashMap::new(), - index_manager: UnifiedIndex::new(), + index_manager: Box::new(HexastoreIndex::new()), rule_map: HashMap::new(), cached_stats: None, } @@ -1484,7 +1484,7 @@ impl SparqlDatabase { dictionary: 
merged_dictionary, prefixes: self.prefixes.clone(), udfs: HashMap::new(), - index_manager: UnifiedIndex::new(), + index_manager: Box::new(HexastoreIndex::new()), rule_map: HashMap::new(), cached_stats: None, } @@ -1553,7 +1553,7 @@ impl SparqlDatabase { dictionary: self.dictionary.clone(), prefixes: self.prefixes.clone(), udfs: HashMap::new(), - index_manager: UnifiedIndex::new(), + index_manager: Box::new(HexastoreIndex::new()), rule_map: HashMap::new(), cached_stats: None, } @@ -2948,7 +2948,7 @@ impl SparqlDatabase { let partial_indexes: Vec<_> = triples .par_chunks(chunk_size) .map(|chunk| { - let mut local_index = shared::index_manager::UnifiedIndex::new(); + let mut local_index = shared::index_manager::HexastoreIndex::new(); for triple in chunk { local_index.insert(triple); } diff --git a/kolibrie/src/storage_manager.rs b/kolibrie/src/storage_manager.rs index 6b6b1f3..23b6fc7 100644 --- a/kolibrie/src/storage_manager.rs +++ b/kolibrie/src/storage_manager.rs @@ -13,6 +13,7 @@ use crate::disk_storage::lsm_tree::{LSMTree, LSMConfig}; use crate::storage_trait::{StorageTrait, QueryAnalysis, QueryAnalyzer, StorageMode}; use crate::execute_query::{execute_query, execute_query_rayon_parallel2_volcano}; use shared::triple::Triple; +use shared::index_manager::TripleIndex; /// Storage backend type - determines where data is physically stored #[derive(Debug, Clone, PartialEq, Eq)] diff --git a/shared/src/index_manager.rs b/shared/src/index_manager/hexastore.rs similarity index 86% rename from shared/src/index_manager.rs rename to shared/src/index_manager/hexastore.rs index cdc857f..426f01e 100644 --- a/shared/src/index_manager.rs +++ b/shared/src/index_manager/hexastore.rs @@ -1,541 +1,524 @@ -/* - * Copyright © 2024 Volodymyr Kadzhaia - * Copyright © 2024 Pieter Bonte - * KU Leuven — Stream Intelligence Lab, Belgium - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. 
If a copy of the MPL was not distributed with this file, - * you can obtain one at https://mozilla.org/MPL/2.0/. - */ - -use serde::{Serialize, Deserialize}; -use std::collections::{HashMap, HashSet}; -use crate::terms::*; -use crate::terms::Term::*; -use crate::triple::Triple; - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct UnifiedIndex { - // The six permutations, using HashMap of HashMap of HashSet. - pub spo: HashMap>>, - pub pos: HashMap>>, - pub osp: HashMap>>, - pub pso: HashMap>>, - pub ops: HashMap>>, - pub sop: HashMap>>, -} - -impl UnifiedIndex { - pub fn new() -> Self { - Self { - spo: HashMap::new(), - pos: HashMap::new(), - osp: HashMap::new(), - pso: HashMap::new(), - ops: HashMap::new(), - sop: HashMap::new(), - } - } - - /// Insert a single triple into all six indexes - pub fn insert(&mut self, triple: &Triple) -> bool { - let Triple { subject: s, predicate: p, object: o } = *triple; - if let Some(pred_map) = self.spo.get(&s) { - if let Some(objects) = pred_map.get(&p) { - if objects.contains(&o) { - return false; // triple already stored - } - } - } - self.spo.entry(s).or_default().entry(p).or_default().insert(o); - self.pos.entry(p).or_default().entry(o).or_default().insert(s); - self.osp.entry(o).or_default().entry(s).or_default().insert(p); - self.pso.entry(p).or_default().entry(s).or_default().insert(o); - self.ops.entry(o).or_default().entry(p).or_default().insert(s); - self.sop.entry(s).or_default().entry(o).or_default().insert(p); - true - } - - /// Delete a single triple from all six indexes - pub fn delete(&mut self, triple: &Triple) -> bool { - let Triple { subject: s, predicate: p, object: o } = *triple; - - let exists = self.spo - .get(&s) - .and_then(|pred_map| pred_map.get(&p)) - .map_or(false, |objects| objects.contains(&o)); - - if !exists { - return false; // triple doesn't exist - } - - // Remove from all six indexes using helper function - remove_from_index(&mut self.spo, s, p, o); - remove_from_index(&mut 
self.pos, p, o, s); - remove_from_index(&mut self.osp, o, s, p); - remove_from_index(&mut self.pso, p, s, o); - remove_from_index(&mut self.ops, o, p, s); - remove_from_index(&mut self.sop, s, o, p); - true - } - - /// Bulk-build the index from a list of triples - pub fn build_from_triples(&mut self, triples: &[Triple]) { - use rayon::prelude::*; - - self.clear(); - - if triples.is_empty() { - return; - } - - // Pre-allocate with capacity estimates - let capacity = triples.len() / 100; - - self.spo.reserve(capacity); - self.pos.reserve(capacity); - self.osp.reserve(capacity); - self.pso.reserve(capacity); - self.ops.reserve(capacity); - self.sop.reserve(capacity); - - // Build indexes in parallel by creating partial indexes and merging - let num_threads = rayon::current_num_threads(); - let chunk_size = (triples.len() / num_threads).max(10_000); - - let partial_indexes: Vec = triples - .par_chunks(chunk_size) - .map(|chunk| { - let mut local_index = UnifiedIndex::new(); - - // Pre-allocate local index - let local_capacity = chunk.len() / 50; - local_index.spo.reserve(local_capacity); - local_index.pos.reserve(local_capacity); - local_index.osp.reserve(local_capacity); - local_index.pso.reserve(local_capacity); - local_index.ops.reserve(local_capacity); - local_index.sop.reserve(local_capacity); - - // Insert triples into local index - for triple in chunk { - local_index.insert_optimized(triple); - } - - local_index - }) - .collect(); - - // Sequentially merge partial indexes - for partial_index in partial_indexes { - self.merge_from(partial_index); - } - - // Optimize memory layout after building - self.optimize_post_build(); - } - - #[inline] - fn insert_optimized(&mut self, triple: &Triple) -> bool { - let Triple { subject: s, predicate: p, object: o } = *triple; - - // Check for duplicates only in SPO index (most selective) - if let Some(pred_map) = self.spo.get(&s) { - if let Some(objects) = pred_map.get(&p) { - if objects.contains(&o) { - return false; - } - } 
- } - - // Batch insert into all indexes - self.spo.entry(s).or_insert_with(|| HashMap::with_capacity(8)) - .entry(p).or_insert_with(|| HashSet::with_capacity(16)) - .insert(o); - - self.pos.entry(p).or_insert_with(|| HashMap::with_capacity(16)) - .entry(o).or_insert_with(|| HashSet::with_capacity(8)) - .insert(s); - - self.osp.entry(o).or_insert_with(|| HashMap::with_capacity(8)) - .entry(s).or_insert_with(|| HashSet::with_capacity(16)) - .insert(p); - - self.pso.entry(p).or_insert_with(|| HashMap::with_capacity(16)) - .entry(s).or_insert_with(|| HashSet::with_capacity(8)) - .insert(o); - - self.ops.entry(o).or_insert_with(|| HashMap::with_capacity(16)) - .entry(p).or_insert_with(|| HashSet::with_capacity(8)) - .insert(s); - - self.sop.entry(s).or_insert_with(|| HashMap::with_capacity(8)) - .entry(o).or_insert_with(|| HashSet::with_capacity(16)) - .insert(p); - - true - } - - fn optimize_post_build(&mut self) { - use rayon::prelude::*; - - // Parallelize the optimization of each index - rayon::scope(|s| { - s.spawn(|_| { - // SPO index - self.spo.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.shrink_to_fit(); - pred_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - }); - self.spo.shrink_to_fit(); - }); - - s.spawn(|_| { - // POS index - self.pos.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.shrink_to_fit(); - obj_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - }); - self.pos.shrink_to_fit(); - }); - - s.spawn(|_| { - // OSP index - self.osp.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.shrink_to_fit(); - subj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - }); - self.osp.shrink_to_fit(); - }); - - s.spawn(|_| { - // PSO index - self.pso.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.shrink_to_fit(); - subj_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - }); - self.pso.shrink_to_fit(); - }); - - s.spawn(|_| { - // 
OPS index - self.ops.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.shrink_to_fit(); - pred_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - }); - self.ops.shrink_to_fit(); - }); - - s.spawn(|_| { - // SOP index - self.sop.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.shrink_to_fit(); - obj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - }); - self.sop.shrink_to_fit(); - }); - }); - } - - /// Query the index - pub fn query(&self, s: Option, p: Option, o: Option) -> Vec { - let mut results = Vec::new(); - - match (s, p, o) { - // Fully bound - (Some(ss), Some(pp), Some(oo)) => { - if let Some(pred_map) = self.spo.get(&ss) { - if let Some(objects) = pred_map.get(&pp) { - if objects.contains(&oo) { - results.push(Triple { subject: ss, predicate: pp, object: oo }); - } - } - } - } - // (S, P, -) - (Some(ss), Some(pp), None) => { - if let Some(pred_map) = self.spo.get(&ss) { - if let Some(objects) = pred_map.get(&pp) { - for &obj in objects { - results.push(Triple { subject: ss, predicate: pp, object: obj }); - } - } - } - } - // (S, -, O) - (Some(ss), None, Some(oo)) => { - if let Some(obj_map) = self.sop.get(&ss) { - if let Some(predicates) = obj_map.get(&oo) { - for &pred in predicates { - results.push(Triple { subject: ss, predicate: pred, object: oo }); - } - } - } - } - // (-, P, O) - (None, Some(pp), Some(oo)) => { - if let Some(obj_map) = self.pos.get(&pp) { - if let Some(subjects) = obj_map.get(&oo) { - for &subj in subjects { - results.push(Triple { subject: subj, predicate: pp, object: oo }); - } - } - } - } - // (S, -, -) - (Some(ss), None, None) => { - if let Some(pred_map) = self.spo.get(&ss) { - for (&pred, objects) in pred_map { - for &obj in objects { - results.push(Triple { subject: ss, predicate: pred, object: obj }); - } - } - } - } - // (-, P, -) - (None, Some(pp), None) => { - if let Some(obj_map) = self.pso.get(&pp) { - for (&subj, objects) in obj_map { - for &obj in 
objects { - results.push(Triple { subject: subj, predicate: pp, object: obj }); - } - } - } - } - // (-, -, O) - (None, None, Some(oo)) => { - if let Some(pred_map) = self.ops.get(&oo) { - for (&pred, subjects) in pred_map { - for &subj in subjects { - results.push(Triple { subject: subj, predicate: pred, object: oo }); - } - } - } - } - // (-, -, -) => all - (None, None, None) => { - for (&subj, pred_map) in &self.spo { - for (&pred, objects) in pred_map { - for &obj in objects { - results.push(Triple { subject: subj, predicate: pred, object: obj }); - } - } - } - } - } - - results - } - - /// Return all triples that match a given `TriplePattern` - pub fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { - let (s, p, o) = pattern; - let sub = match s { - Constant(x) => Some(*x), - Variable(_) => None, - }; - let pre = match p { - Constant(x) => Some(*x), - Variable(_) => None, - }; - let obj = match o { - Constant(x) => Some(*x), - Variable(_) => None, - }; - - self.query(sub, pre, obj) - } - - /// Clear all data in the indexes - pub fn clear(&mut self) { - self.spo.clear(); - self.pos.clear(); - self.osp.clear(); - self.pso.clear(); - self.ops.clear(); - self.sop.clear(); - } - - /// Scan using the Subject-Predicate index (spo) - pub fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { - self.spo - .get(&s) - .and_then(|pred_map| pred_map.get(&p)) - } - - /// Scan using the Subject-Object index (sop) - pub fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { - self.sop - .get(&s) - .and_then(|obj_map| obj_map.get(&o)) - } - - /// Scan using the Predicate-Object index (pos) - pub fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { - self.pos - .get(&p) - .and_then(|obj_map| obj_map.get(&o)) - } - - pub fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { - self.pso - .get(&p) - .and_then(|subj_map| subj_map.get(&s)) - } - - pub fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { - self.osp - .get(&o) - .and_then(|subj_map| 
subj_map.get(&s)) - } - - pub fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { - self.ops - .get(&o) - .and_then(|pred_map| pred_map.get(&p)) - } - - /// Efficiently merge another index into this one using parallel processing where possible - pub fn merge_from(&mut self, other: UnifiedIndex) { - // Merge SPO index - for (s, pred_map) in other.spo { - let entry = self.spo.entry(s).or_insert_with(HashMap::new); - for (p, obj_set) in pred_map { - entry.entry(p).or_insert_with(HashSet::new).extend(obj_set); - } - } - - // Merge PSO index - for (p, subj_map) in other.pso { - let entry = self.pso.entry(p).or_insert_with(HashMap::new); - for (s, obj_set) in subj_map { - entry.entry(s).or_insert_with(HashSet::new).extend(obj_set); - } - } - - // Merge OPS index - for (o, pred_map) in other.ops { - let entry = self.ops.entry(o).or_insert_with(HashMap::new); - for (p, subj_set) in pred_map { - entry.entry(p).or_insert_with(HashSet::new).extend(subj_set); - } - } - - // Merge POS index - for (p, obj_map) in other.pos { - let entry = self.pos.entry(p).or_insert_with(HashMap::new); - for (o, subj_set) in obj_map { - entry.entry(o).or_insert_with(HashSet::new).extend(subj_set); - } - } - - // Merge OSP index - for (o, subj_map) in other.osp { - let entry = self.osp.entry(o).or_insert_with(HashMap::new); - for (s, pred_set) in subj_map { - entry.entry(s).or_insert_with(HashSet::new).extend(pred_set); - } - } - - // Merge SOP index - for (s, obj_map) in other.sop { - let entry = self.sop.entry(s).or_insert_with(HashMap::new); - for (o, pred_set) in obj_map { - entry.entry(o).or_insert_with(HashSet::new).extend(pred_set); - } - } - } - - pub fn optimize(&mut self) { - use rayon::prelude::*; - - // Optimize SPO index - self.spo.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - pred_map.shrink_to_fit(); - }); - self.spo.shrink_to_fit(); - - // Optimize PSO index - 
self.pso.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - subj_map.shrink_to_fit(); - }); - self.pso.shrink_to_fit(); - - // Optimize OPS index - self.ops.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - pred_map.shrink_to_fit(); - }); - self.ops.shrink_to_fit(); - - // Optimize POS index - self.pos.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - obj_map.shrink_to_fit(); - }); - self.pos.shrink_to_fit(); - - // Optimize OSP index - self.osp.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - subj_map.shrink_to_fit(); - }); - self.osp.shrink_to_fit(); - - // Optimize SOP index - self.sop.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - obj_map.shrink_to_fit(); - }); - self.sop.shrink_to_fit(); - } -} - -/// Helper function to remove a triple from a nested index structure and clean up empty collections -#[inline] -fn remove_from_index( - index: &mut HashMap>>, - key1: u32, - key2: u32, - value: u32, -) { - if let Some(inner_map) = index.get_mut(&key1) { - if let Some(set) = inner_map.get_mut(&key2) { - set.remove(&value); - // Clean up empty inner set - if set.is_empty() { - inner_map.remove(&key2); - } - } - // Clean up empty inner map - if inner_map.is_empty() { - index.remove(&key1); - } - } -} +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HexastoreIndex { + // The six permutations, using HashMap of HashMap of HashSet. 
+ pub spo: HashMap>>, + pub pos: HashMap>>, + pub osp: HashMap>>, + pub pso: HashMap>>, + pub ops: HashMap>>, + pub sop: HashMap>>, +} + +impl TripleIndex for HexastoreIndex { + fn new() -> Self { + Self { + spo: HashMap::new(), + pos: HashMap::new(), + osp: HashMap::new(), + pso: HashMap::new(), + ops: HashMap::new(), + sop: HashMap::new(), + } + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: true, so: true, po: true, + ps: true, os: true, op: true + } + } + + /// Insert a single triple into all six indexes + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objects) = pred_map.get(&p) { + if objects.contains(&o) { + return false; // triple already stored + } + } + } + self.spo.entry(s).or_default().entry(p).or_default().insert(o); + self.pos.entry(p).or_default().entry(o).or_default().insert(s); + self.osp.entry(o).or_default().entry(s).or_default().insert(p); + self.pso.entry(p).or_default().entry(s).or_default().insert(o); + self.ops.entry(o).or_default().entry(p).or_default().insert(s); + self.sop.entry(s).or_default().entry(o).or_default().insert(p); + true + } + + /// Delete a single triple from all six indexes + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + let exists = self.spo + .get(&s) + .and_then(|pred_map| pred_map.get(&p)) + .map_or(false, |objects| objects.contains(&o)); + + if !exists { + return false; // triple doesn't exist + } + + // Remove from all six indexes using helper function + remove_from_index(&mut self.spo, s, p, o); + remove_from_index(&mut self.pos, p, o, s); + remove_from_index(&mut self.osp, o, s, p); + remove_from_index(&mut self.pso, p, s, o); + remove_from_index(&mut self.ops, o, p, s); + remove_from_index(&mut self.sop, s, o, p); + true + } + + /// Bulk-build the index from a list of 
triples + fn build_from_triples(&mut self, triples: &[Triple]) { + for triple in triples { + self.insert(triple); + } + use rayon::prelude::*; + + self.clear(); + + if triples.is_empty() { + return; + } + + // Pre-allocate with capacity estimates + let capacity = triples.len() / 100; + + self.spo.reserve(capacity); + self.pos.reserve(capacity); + self.osp.reserve(capacity); + self.pso.reserve(capacity); + self.ops.reserve(capacity); + self.sop.reserve(capacity); + + // Build indexes in parallel by creating partial indexes and merging + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partial_indexes: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local_index = HexastoreIndex::new(); + + // Pre-allocate local index + let local_capacity = chunk.len() / 50; + local_index.spo.reserve(local_capacity); + local_index.pos.reserve(local_capacity); + local_index.osp.reserve(local_capacity); + local_index.pso.reserve(local_capacity); + local_index.ops.reserve(local_capacity); + local_index.sop.reserve(local_capacity); + + // Insert triples into local index + for triple in chunk { + local_index.insert_optimized(triple); + } + + local_index + }) + .collect(); + + // Sequentially merge partial indexes + for partial_index in partial_indexes { + self.merge_from(partial_index); + } + + // Optimize memory layout after building + self.optimize_post_build(); + } + + /// Query the index + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + + match (s, p, o) { + // Fully bound + (Some(ss), Some(pp), Some(oo)) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let Some(objects) = pred_map.get(&pp) { + if objects.contains(&oo) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + // (S, P, -) + (Some(ss), Some(pp), None) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let Some(objects) = pred_map.get(&pp) { + for 
&obj in objects { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + // (S, -, O) + (Some(ss), None, Some(oo)) => { + if let Some(obj_map) = self.sop.get(&ss) { + if let Some(predicates) = obj_map.get(&oo) { + for &pred in predicates { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + // (-, P, O) + (None, Some(pp), Some(oo)) => { + if let Some(obj_map) = self.pos.get(&pp) { + if let Some(subjects) = obj_map.get(&oo) { + for &subj in subjects { + results.push(Triple { subject: subj, predicate: pp, object: oo }); + } + } + } + } + // (S, -, -) + (Some(ss), None, None) => { + if let Some(pred_map) = self.spo.get(&ss) { + for (&pred, objects) in pred_map { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + // (-, P, -) + (None, Some(pp), None) => { + if let Some(obj_map) = self.pso.get(&pp) { + for (&subj, objects) in obj_map { + for &obj in objects { + results.push(Triple { subject: subj, predicate: pp, object: obj }); + } + } + } + } + // (-, -, O) + (None, None, Some(oo)) => { + if let Some(pred_map) = self.ops.get(&oo) { + for (&pred, subjects) in pred_map { + for &subj in subjects { + results.push(Triple { subject: subj, predicate: pred, object: oo }); + } + } + } + } + // (-, -, -) => all + (None, None, None) => { + for (&subj, pred_map) in &self.spo { + for (&pred, objects) in pred_map { + for &obj in objects { + results.push(Triple { subject: subj, predicate: pred, object: obj }); + } + } + } + } + } + + results + } + + /// Return all triples that match a given `TriplePattern` + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { + Constant(x) => Some(*x), + Variable(_) => None, + }; + let pre = match p { + Constant(x) => Some(*x), + Variable(_) => None, + }; + let obj = match o { + Constant(x) => Some(*x), + Variable(_) => None, + }; + + self.query(sub, pre, obj) + } 
+ + /// Clear all data in the indexes + fn clear(&mut self) { + self.spo.clear(); + self.pos.clear(); + self.osp.clear(); + self.pso.clear(); + self.ops.clear(); + self.sop.clear(); + } + + /// Scan using the Subject-Predicate index (spo) + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + self.spo + .get(&s) + .and_then(|pred_map| pred_map.get(&p)) + } + + /// Scan using the Subject-Object index (sop) + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + self.sop + .get(&s) + .and_then(|obj_map| obj_map.get(&o)) + } + + /// Scan using the Predicate-Object index (pos) + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + self.pos + .get(&p) + .and_then(|obj_map| obj_map.get(&o)) + } + + /// Scan using the Predicate-Subject index (pso) + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + self.pso + .get(&p) + .and_then(|subj_map| subj_map.get(&s)) + } + + /// Scan using the Object-Subject index (osp) + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + self.osp + .get(&o) + .and_then(|subj_map| subj_map.get(&s)) + } + + /// Scan using the Object-Predicate index (ops) + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { + self.ops + .get(&o) + .and_then(|pred_map| pred_map.get(&p)) + } + + fn optimize(&mut self) { + use rayon::prelude::*; + + // Optimize SPO index + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + pred_map.shrink_to_fit(); + }); + self.spo.shrink_to_fit(); + + // Optimize PSO index + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + subj_map.shrink_to_fit(); + }); + self.pso.shrink_to_fit(); + + // Optimize OPS index + self.ops.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + pred_map.shrink_to_fit(); + }); + self.ops.shrink_to_fit(); + + // Optimize POS 
index + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + obj_map.shrink_to_fit(); + }); + self.pos.shrink_to_fit(); + + // Optimize OSP index + self.osp.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + subj_map.shrink_to_fit(); + }); + self.osp.shrink_to_fit(); + + // Optimize SOP index + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + obj_map.shrink_to_fit(); + }); + self.sop.shrink_to_fit(); + } + +} + +impl HexastoreIndex { + /// Efficiently merge another index into this one using parallel processing where possible + pub fn merge_from(&mut self, other: HexastoreIndex) { + // Merge SPO index + for (s, pred_map) in other.spo { + let entry = self.spo.entry(s).or_insert_with(HashMap::new); + for (p, obj_set) in pred_map { + entry.entry(p).or_insert_with(HashSet::new).extend(obj_set); + } + } + + // Merge PSO index + for (p, subj_map) in other.pso { + let entry = self.pso.entry(p).or_insert_with(HashMap::new); + for (s, obj_set) in subj_map { + entry.entry(s).or_insert_with(HashSet::new).extend(obj_set); + } + } + + // Merge OPS index + for (o, pred_map) in other.ops { + let entry = self.ops.entry(o).or_insert_with(HashMap::new); + for (p, subj_set) in pred_map { + entry.entry(p).or_insert_with(HashSet::new).extend(subj_set); + } + } + + // Merge POS index + for (p, obj_map) in other.pos { + let entry = self.pos.entry(p).or_insert_with(HashMap::new); + for (o, subj_set) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(subj_set); + } + } + + // Merge OSP index + for (o, subj_map) in other.osp { + let entry = self.osp.entry(o).or_insert_with(HashMap::new); + for (s, pred_set) in subj_map { + entry.entry(s).or_insert_with(HashSet::new).extend(pred_set); + } + } + + // Merge SOP index + for (s, 
obj_map) in other.sop { + let entry = self.sop.entry(s).or_insert_with(HashMap::new); + for (o, pred_set) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(pred_set); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + // Check for duplicates only in SPO index (most selective) + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objects) = pred_map.get(&p) { + if objects.contains(&o) { + return false; + } + } + } + + // Batch insert into all indexes + self.spo.entry(s).or_insert_with(|| HashMap::with_capacity(8)) + .entry(p).or_insert_with(|| HashSet::with_capacity(16)) + .insert(o); + + self.pos.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(o).or_insert_with(|| HashSet::with_capacity(8)) + .insert(s); + + self.osp.entry(o).or_insert_with(|| HashMap::with_capacity(8)) + .entry(s).or_insert_with(|| HashSet::with_capacity(16)) + .insert(p); + + self.pso.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(s).or_insert_with(|| HashSet::with_capacity(8)) + .insert(o); + + self.ops.entry(o).or_insert_with(|| HashMap::with_capacity(16)) + .entry(p).or_insert_with(|| HashSet::with_capacity(8)) + .insert(s); + + self.sop.entry(s).or_insert_with(|| HashMap::with_capacity(8)) + .entry(o).or_insert_with(|| HashSet::with_capacity(16)) + .insert(p); + + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + + // Parallelize the optimization of each index + rayon::scope(|s| { + s.spawn(|_| { + // SPO index + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.shrink_to_fit(); + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.spo.shrink_to_fit(); + }); + + s.spawn(|_| { + // POS index + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + }); 
+ self.pos.shrink_to_fit(); + }); + + s.spawn(|_| { + // OSP index + self.osp.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.shrink_to_fit(); + subj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + }); + self.osp.shrink_to_fit(); + }); + + s.spawn(|_| { + // PSO index + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.shrink_to_fit(); + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.pso.shrink_to_fit(); + }); + + s.spawn(|_| { + // OPS index + self.ops.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.shrink_to_fit(); + pred_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + }); + self.ops.shrink_to_fit(); + }); + + s.spawn(|_| { + // SOP index + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + }); + self.sop.shrink_to_fit(); + }); + }); + } +} \ No newline at end of file diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs new file mode 100644 index 0000000..adffc14 --- /dev/null +++ b/shared/src/index_manager/mod.rs @@ -0,0 +1,100 @@ +/* + * Copyright © 2024 Volodymyr Kadzhaia + * Copyright © 2024 Pieter Bonte + * KU Leuven — Stream Intelligence Lab, Belgium + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * you can obtain one at https://mozilla.org/MPL/2.0/. + */ + +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::terms::Term::*; +use crate::triple::Triple; + +pub use hexastore::HexastoreIndex; +pub use pso_single::PSOSingleIndex; +pub mod hexastore; +pub mod pso_single; + +/// Describes which access patterns an index can serve efficiently. 
+#[derive(Debug, Clone)] +pub struct AccessPatternSupport { + pub sp: bool, // subject+predicate -> objects + pub so: bool, // subject+object -> predicates + pub po: bool, // predicate+object -> subjects + pub ps: bool, // predicate+subject -> objects + pub os: bool, // object+subject -> predicates + pub op: bool, // object+predicate -> subjects +} + +pub trait TripleIndex: Send + Sync { + // ── Mutation ── + fn new() -> Self; + fn insert(&mut self, triple: &Triple) -> bool; + fn delete(&mut self, triple: &Triple) -> bool; + fn clear(&mut self); + + // ── Pattern query ── + /// Returns all triples matching the (s?, p?, o?) pattern. + /// Always works regardless of existing indexes. + fn query(&self, s: Option, p: Option, o: Option) -> Vec; + + /// Same as query but works with TriplePattern (for convenience). + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec; + + // ── Two-key scans ── + // These return None if the index doesn't support this access path + // efficiently — the engine will then fall back to query() + filter. + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet>; + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet>; + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet>; + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet>; + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet>; + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet>; + + // ── Bulk operations ── + + /// Absorb all triples from a slice. The default implementation + /// calls insert() in a loop, concrete types can override with + /// a faster path. + fn build_from_triples(&mut self, triples: &[Triple]) { + for triple in triples { + self.insert(triple); + } + } + + /// Reclaim wasted memory / compact internal data structures. + /// The default is to do nothing, concrete types override if they + /// have internal structures that benefit from compaction. 
+ fn optimize(&mut self) {} + + // ── Metadata ── + /// Reports which access patterns this index supports efficiently. + fn supported_access_patterns(&self) -> AccessPatternSupport; +} + +/// Helper function to remove a triple from a nested index structure and clean up empty collections +#[inline] +fn remove_from_index( + index: &mut HashMap>>, + key1: u32, + key2: u32, + value: u32, +) { + if let Some(inner_map) = index.get_mut(&key1) { + if let Some(set) = inner_map.get_mut(&key2) { + set.remove(&value); + // Clean up empty inner set + if set.is_empty() { + inner_map.remove(&key2); + } + } + // Clean up empty inner map + if inner_map.is_empty() { + index.remove(&key1); + } + } +} diff --git a/shared/src/index_manager/pso_single.rs b/shared/src/index_manager/pso_single.rs new file mode 100644 index 0000000..1c607f5 --- /dev/null +++ b/shared/src/index_manager/pso_single.rs @@ -0,0 +1,314 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PSOSingleIndex { + // The six permutations, using HashMap of HashMap of HashSet. 
+ pub pso: HashMap>>, +} + +impl TripleIndex for PSOSingleIndex { + fn new() -> Self { + Self { + pso: HashMap::new(), + } + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: false, po: false, + ps: true, os: false, op: false + } + } + + /// Insert a single triple into all six indexes + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.pso.get(&p) { + if let Some(objects) = sub_map.get(&s) { + if objects.contains(&o) { + return false; // triple already stored + } + } + } + self.pso.entry(p).or_default().entry(s).or_default().insert(o); + true + } + + /// Delete a single triple from all six indexes + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + let exists = self.pso + .get(&p) + .and_then(|sub_map| sub_map.get(&s)) + .map_or(false, |objects| objects.contains(&o)); + + if !exists { + return false; // triple doesn't exist + } + + // Remove from all six indexes using helper function + remove_from_index(&mut self.pso, p, s, o); + true + } + + /// Bulk-build the index from a list of triples + fn build_from_triples(&mut self, triples: &[Triple]) { + for triple in triples { + self.insert(triple); + } + use rayon::prelude::*; + + self.clear(); + + if triples.is_empty() { + return; + } + + // Pre-allocate with capacity estimates + let capacity = triples.len() / 100; + + self.pso.reserve(capacity); + + // Build indexes in parallel by creating partial indexes and merging + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partial_indexes: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local_index = PSOSingleIndex::new(); + + // Pre-allocate local index + let local_capacity = chunk.len() / 50; + local_index.pso.reserve(local_capacity); + + // Insert triples into local 
index + for triple in chunk { + local_index.insert_optimized(triple); + } + + local_index + }) + .collect(); + + // Sequentially merge partial indexes + for partial_index in partial_indexes { + self.merge_from(partial_index); + } + + // Optimize memory layout after building + self.optimize_post_build(); + } + + /// Query the index + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + + match (s, p, o) { + // Fully bound + (Some(ss), Some(pp), Some(oo)) => { + if let Some(sub_map) = self.pso.get(&pp) { + if let Some(objects) = sub_map.get(&ss) { + if objects.contains(&oo) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + // (S, P, -) + (Some(ss), Some(pp), None) => { + if let Some(sub_map) = self.pso.get(&pp) { + if let Some(objects) = sub_map.get(&ss) { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + // (S, -, O) + (Some(ss), None, Some(oo)) => { + for (&pred, sub_map) in &self.pso { + if let Some(objects) = sub_map.get(&ss) { + if objects.contains(&oo) { + results.push(Triple { subject: ss, predicate: pred, object: oo }) + } + } + } + } + // (-, P, O) + (None, Some(pp), Some(oo)) => { + if let Some(sub_map) = self.pso.get(&pp) { + for (&sub, objects) in sub_map { + if objects.contains(&oo) { + results.push(Triple { subject: sub, predicate: pp, object: oo }) + } + } + } + } + // (S, -, -) + (Some(ss), None, None) => { + for (&pred, sub_map) in &self.pso { + if let Some(objects) = sub_map.get(&ss) { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pred, object: obj }) + } + } + } + } + // (-, P, -) + (None, Some(pp), None) => { + if let Some(sub_map) = self.pso.get(&pp) { + for (&sub, objects) in sub_map { + for &obj in objects { + results.push(Triple { subject: sub, predicate: pp, object: obj }) + } + } + } + } + // (-, -, O) + (None, None, Some(oo)) => { + for (&pred, sub_map) in &self.pso { + for (&sub, 
objects) in sub_map { + if objects.contains(&oo) { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + // (-, -, -) => all + (None, None, None) => { + for (&pred, sub_map) in &self.pso { + for (&sub, objects) in sub_map { + for &obj in objects { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + + results + } + + /// Return all triples that match a given `TriplePattern` + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { + Constant(x) => Some(*x), + Variable(_) => None, + }; + let pre = match p { + Constant(x) => Some(*x), + Variable(_) => None, + }; + let obj = match o { + Constant(x) => Some(*x), + Variable(_) => None, + }; + + self.query(sub, pre, obj) + } + + /// Clear all data in the indexes + fn clear(&mut self) { + self.pso.clear(); + } + + /// Scan using the Subject-Predicate index (spo) + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Subject-Object index (sop) + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Predicate-Object index (pos) + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Predicate-Subject index (pso) + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + self.pso + .get(&p) + .and_then(|subj_map| subj_map.get(&s)) + } + + /// Scan using the Object-Subject index (osp) + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Object-Predicate index (ops) + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + fn optimize(&mut self) { + use rayon::prelude::*; + + // Optimize PSO index + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + subj_map.shrink_to_fit(); + }); + self.pso.shrink_to_fit(); + } 
+ +} + +impl PSOSingleIndex { + /// Efficiently merge another index into this one using parallel processing where possible + pub fn merge_from(&mut self, other: PSOSingleIndex) { + + // Merge PSO index + for (p, subj_map) in other.pso { + let entry = self.pso.entry(p).or_insert_with(HashMap::new); + for (s, obj_set) in subj_map { + entry.entry(s).or_insert_with(HashSet::new).extend(obj_set); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + // Check for duplicates + if let Some(sub_map) = self.pso.get(&p) { + if let Some(objects) = sub_map.get(&s) { + if objects.contains(&o) { + return false; + } + } + } + + // Batch insert into all indexes + self.pso.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(s).or_insert_with(|| HashSet::with_capacity(8)) + .insert(o); + + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.shrink_to_fit(); + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.pso.shrink_to_fit(); + } +} \ No newline at end of file diff --git a/shared/src/join_algorithm.rs b/shared/src/join_algorithm.rs index fccd9b7..6b82fe8 100644 --- a/shared/src/join_algorithm.rs +++ b/shared/src/join_algorithm.rs @@ -10,7 +10,7 @@ use crate::dictionary::Dictionary; use crate::triple::Triple; -use crate::index_manager::UnifiedIndex; +use crate::index_manager::*; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; use rayon::prelude::*; @@ -19,7 +19,7 @@ pub fn perform_join_par_simd_with_strict_filter_4_redesigned_streaming( subject_var: String, predicate: String, object_var: String, - index_manager: &UnifiedIndex, // ← Pass index instead of database + index_manager: Box, // ← Pass index instead of database dictionary: &Dictionary, final_results: Vec>, literal_filter: Option, @@ -46,45 +46,15 @@ pub fn 
perform_join_par_simd_with_strict_filter_4_redesigned_streaming( dictionary, ); - // FIX: Use PSO index instead of POS for better ordering let mut filtered_triples: Vec = if let Some(pred_id) = predicate_id { - // Use PSO index (Predicate -> Subject -> Object) - // This gives results sorted by subject first! - if let Some(subject_map) = index_manager. pso.get(&pred_id) { - // Collect subjects in sorted order - let mut subjects: Vec<_> = subject_map.iter().collect(); - subjects.sort_unstable_by_key(|(subj, _)| *subj); // Sort by subject - - subjects - .par_iter() - .flat_map(|(&subject, objects)| { - // Objects are in HashSet, convert to sorted Vec - let mut sorted_objects: Vec = objects.iter().copied().collect(); - sorted_objects.sort_unstable(); // Sort objects within each subject - - // Build triples - naturally sorted by (subject, object)! - sorted_objects - .into_iter() - .filter_map(|object| { - // Apply literal filter if present - if let Some(filter_id) = literal_filter_id { - if object != filter_id { - return None; - } - } - - Some(Triple { - subject, - predicate: pred_id, - object, - }) - }) - .collect::>() - }) - .collect() - } else { - Vec::new() + let mut triples = index_manager.query(None, Some(pred_id), None); + + // Apply literal filter if present + if let Some(filter_id) = literal_filter_id { + triples.retain(|t| t.object == filter_id); } + + triples } else { Vec::new() }; From b8b8986f46f66e5c38879ef027b583d6bb0765e2 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Mon, 16 Feb 2026 02:23:13 +0100 Subject: [PATCH 03/23] Add support for manual datastore type override --- datalog/src/reasoning.rs | 11 +- kolibrie/src/sparql_database.rs | 11 +- .../execution/engine.rs | 170 ++++++++---------- shared/src/index_manager/hexastore.rs | 29 ++- shared/src/index_manager/mod.rs | 17 +- shared/src/index_manager/pso_single.rs | 18 +- 6 files changed, 138 insertions(+), 118 deletions(-) diff --git a/datalog/src/reasoning.rs 
b/datalog/src/reasoning.rs index 73fc34a..9c2478b 100644 --- a/datalog/src/reasoning.rs +++ b/datalog/src/reasoning.rs @@ -11,7 +11,7 @@ use shared::dictionary::Dictionary; use shared::triple::Triple; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use shared::index_manager::*; +use shared::index_manager::TripleIndex; use shared::rule_index::RuleIndex; use shared::terms::{Term, TriplePattern}; use shared::rule::Rule; @@ -26,18 +26,21 @@ use shared::rule::FilterCondition; pub struct Reasoner { pub dictionary: Dictionary, pub rules: Vec, // List of dynamic rules - - pub index_manager: Box, + pub index_manager: Box, pub rule_index: RuleIndex, pub constraints: Vec, } impl Reasoner { pub fn new() -> Self { + Self::with_index(Box::new(shared::index_manager::HexastoreIndex::new())) + } + + pub fn with_index(index: Box) -> Self { Self { dictionary: Dictionary::new(), rules: Vec::new(), - index_manager: Box::new(HexastoreIndex::new()), + index_manager: index, rule_index: RuleIndex::new(), constraints: Vec::new(), } diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index f35ccbf..3c920b9 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -19,7 +19,7 @@ use crate::utils::current_timestamp; use crate::utils::ClonableFn; #[cfg(feature = "cuda")] use crate::cuda::cuda_join::*; -use shared::index_manager::*; +use shared::index_manager::TripleIndex; use crate::query_builder::QueryBuilder; use crossbeam::channel::unbounded; use crossbeam::scope; @@ -52,7 +52,7 @@ pub struct SparqlDatabase { pub dictionary: Dictionary, pub prefixes: HashMap, pub udfs: HashMap, - pub index_manager: Box, + pub index_manager: Box, pub rule_map: HashMap, pub cached_stats: Option>, } @@ -60,6 +60,11 @@ pub struct SparqlDatabase { #[allow(dead_code)] impl SparqlDatabase { pub fn new() -> Self { + Self::with_index(Box::new(shared::index_manager::HexastoreIndex::new())) + } + + /// Creates a new database with a user-chosen 
indexing strategy. + pub fn with_index(index: Box) -> Self { Self { triples: BTreeSet::new(), streams: Vec::new(), @@ -67,7 +72,7 @@ impl SparqlDatabase { dictionary: Dictionary::new(), prefixes: HashMap::new(), udfs: HashMap::new(), - index_manager: Box::new(HexastoreIndex::new()), + index_manager: index, rule_map: HashMap::new(), cached_stats: None, } diff --git a/kolibrie/src/streamertail_optimizer/execution/engine.rs b/kolibrie/src/streamertail_optimizer/execution/engine.rs index b46241b..1e6493f 100644 --- a/kolibrie/src/streamertail_optimizer/execution/engine.rs +++ b/kolibrie/src/streamertail_optimizer/execution/engine.rs @@ -970,17 +970,11 @@ impl ExecutionEngine { match pattern { // FULLY BOUND (3 constants) - just check if triple exists (Term::Constant(s), Term::Constant(p), Term::Constant(o)) => { - // Use SPO index to check existence - if let Some(pred_map) = database.index_manager.spo.get(s) { - if let Some(objects) = pred_map.get(p) { - if objects.contains(o) { - // Triple exists - return empty binding (no variables to bind) - return vec![HashMap::new()]; - } - } + if !database.index_manager.query(Some(*s), Some(*p), Some(*o)).is_empty() { + return vec![HashMap::new()]; + } else { + return Vec::new(); } - // Triple doesn't exist - Vec::new() } // TWO BOUNDS (2 constants, 1 variable) @@ -1020,22 +1014,25 @@ impl ExecutionEngine { predicate: u32, object_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable name let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - if let Some(pred_map) = database.index_manager.spo.get(&subject) { - if let Some(objects) = pred_map.get(&predicate) { - // Use pre-compute the key - objects.iter().map(|&object| { - let mut result = HashMap::with_capacity(1); // Pre-size - result.insert(object_var.clone(), object); // Still need clone in closure - result - }).collect() - } else { - Vec::new() - } + // Try efficient two-key scan first + if let Some(objects) = database.index_manager.scan_sp(subject, predicate) { + objects.iter().map(|&object| { + let mut result = HashMap::with_capacity(1); + result.insert(object_var.clone(), object); + result + }).collect() } else { - Vec::new() + // Fallback: query(Some(s), Some(p), None) + database.index_manager.query(Some(subject), Some(predicate), None) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(1); + result.insert(object_var.clone(), triple.object); + result + }) + .collect() } } @@ -1046,22 +1043,24 @@ impl ExecutionEngine { object: u32, predicate_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable name let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); - if let Some(obj_map) = database.index_manager.sop.get(&subject) { - if let Some(predicates) = obj_map.get(&object) { - // Use iterator with pre-sized HashMap - predicates.iter().map(|&predicate| { + if let Some(predicates) = database.index_manager.scan_so(subject, object) { + predicates.iter().map(|&predicate| { + let mut result = HashMap::with_capacity(1); + result.insert(predicate_var.clone(), predicate); + result + }).collect() + } else { + // Fallback: query(Some(s), None, Some(o)) + database.index_manager.query(Some(subject), None, Some(object)) + .into_iter() + .map(|triple| { let mut result = HashMap::with_capacity(1); - result.insert(predicate_var.clone(), predicate); + result.insert(predicate_var.clone(), triple.predicate); result - }).collect() - } else { - Vec::new() - } - } else { - Vec::new() + }) + .collect() } } @@ -1072,22 +1071,24 @@ impl ExecutionEngine { object: u32, subject_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable name let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); - if let Some(obj_map) = database.index_manager.pos.get(&predicate) { - if let Some(subjects) = obj_map.get(&object) { - // Use iterator with pre-sized HashMap - subjects.iter().map(|&subject| { + if let Some(subjects) = database.index_manager.scan_po(predicate, object) { + subjects.iter().map(|&subject| { + let mut result = HashMap::with_capacity(1); + result.insert(subject_var.clone(), subject); + result + }).collect() + } else { + // Fallback: query(None, Some(p), Some(o)) + database.index_manager.query(None, Some(predicate), Some(object)) + .into_iter() + .map(|triple| { let mut result = HashMap::with_capacity(1); - result.insert(subject_var.clone(), subject); + result.insert(subject_var.clone(), triple.subject); result - }).collect() - } else { - Vec::new() - } - } else { - Vec::new() + }) + .collect() } } @@ -1098,25 +1099,18 @@ impl ExecutionEngine { predicate_var: String, object_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable names let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - if let Some(pred_map) = database.index_manager.spo.get(&subject) { - // Clone variable names once before flat_map - pred_map.iter().flat_map(|(&predicate, objects)| { - let predicate_var = predicate_var.clone(); - let object_var = object_var.clone(); - objects.iter().map(move |&object| { - let mut result = HashMap::with_capacity(2); - result.insert(predicate_var.clone(), predicate); - result.insert(object_var.clone(), object); - result - }) - }).collect() - } else { - Vec::new() - } + database.index_manager.query(Some(subject), None, None) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(2); + result.insert(predicate_var.clone(), triple.predicate); + result.insert(object_var.clone(), triple.object); + result + }) + .collect() } /// Scans P index (Predicate -> (Subject, Object)) @@ -1126,25 +1120,18 @@ impl ExecutionEngine { subject_var: String, object_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable names let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - if let Some(obj_map) = database.index_manager.pos.get(&predicate) { - // Clone variable names once before flat_map - obj_map.iter().flat_map(|(&object, subjects)| { - let subject_var = subject_var.clone(); - let object_var = object_var.clone(); - subjects.iter().map(move |&subject| { - let mut result = HashMap::with_capacity(2); - result.insert(subject_var.clone(), subject); - result.insert(object_var.clone(), object); - result - }) - }).collect() - } else { - Vec::new() - } + database.index_manager.query(None, Some(predicate), None) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(2); + result.insert(subject_var.clone(), triple.subject); + result.insert(object_var.clone(), triple.object); + result + }) + .collect() } /// Scans O index (Object -> (Subject, Predicate)) @@ -1154,24 +1141,17 @@ impl ExecutionEngine { subject_var: String, predicate_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable names let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); - if let Some(subj_map) = database.index_manager.osp.get(&object) { - // Clone variable names once before flat_map - subj_map.iter().flat_map(|(&subject, predicates)| { - let subject_var = subject_var.clone(); - let predicate_var = predicate_var.clone(); - predicates.iter().map(move |&predicate| { - let mut result = HashMap::with_capacity(2); - result.insert(subject_var.clone(), subject); - result.insert(predicate_var.clone(), predicate); - result - }) - }).collect() - } else { - Vec::new() - } + database.index_manager.query(None, None, Some(object)) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(2); + result.insert(subject_var.clone(), triple.subject); + result.insert(predicate_var.clone(), triple.predicate); + result + }) + .collect() } } diff --git a/shared/src/index_manager/hexastore.rs b/shared/src/index_manager/hexastore.rs index 426f01e..6e5f35c 100644 --- a/shared/src/index_manager/hexastore.rs +++ b/shared/src/index_manager/hexastore.rs @@ -16,15 +16,15 @@ pub struct HexastoreIndex { } impl TripleIndex for HexastoreIndex { - fn new() -> Self { - Self { - spo: HashMap::new(), - pos: HashMap::new(), - osp: HashMap::new(), - pso: HashMap::new(), - ops: HashMap::new(), - sop: HashMap::new(), - } + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + // Efficient: count directly from SPO index + self.spo.values() + .map(|pred_map| pred_map.values().map(|objs| objs.len()).sum::()) + .sum() } fn supported_access_patterns(&self) -> AccessPatternSupport { @@ -357,6 +357,17 @@ impl TripleIndex for HexastoreIndex { } impl HexastoreIndex { + pub fn new() -> Self { + Self { + spo: HashMap::new(), + pos: HashMap::new(), + osp: HashMap::new(), + pso: HashMap::new(), + ops: HashMap::new(), + sop: 
HashMap::new(), + } + } + /// Efficiently merge another index into this one using parallel processing where possible pub fn merge_from(&mut self, other: HexastoreIndex) { // Merge SPO index diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs index adffc14..1a766b1 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -8,7 +8,6 @@ * you can obtain one at https://mozilla.org/MPL/2.0/. */ -use serde::{Serialize, Deserialize}; use std::collections::{HashMap, HashSet}; use crate::terms::*; use crate::terms::Term::*; @@ -30,9 +29,8 @@ pub struct AccessPatternSupport { pub op: bool, // object+predicate -> subjects } -pub trait TripleIndex: Send + Sync { +pub trait TripleIndex: Send + Sync + std::fmt::Debug { // ── Mutation ── - fn new() -> Self; fn insert(&mut self, triple: &Triple) -> bool; fn delete(&mut self, triple: &Triple) -> bool; fn clear(&mut self); @@ -74,6 +72,19 @@ pub trait TripleIndex: Send + Sync { // ── Metadata ── /// Reports which access patterns this index supports efficiently. fn supported_access_patterns(&self) -> AccessPatternSupport; + fn triple_count(&self) -> usize { + self.query(None, None, None).len() // default: expensive but correct + } + + // ── Cloning support for Box ── + fn clone_box(&self) -> Box; +} + +/// Allow `Clone` on `Box`. 
+impl Clone for Box { + fn clone(&self) -> Self { + self.clone_box() + } } /// Helper function to remove a triple from a nested index structure and clean up empty collections diff --git a/shared/src/index_manager/pso_single.rs b/shared/src/index_manager/pso_single.rs index 1c607f5..c3b7824 100644 --- a/shared/src/index_manager/pso_single.rs +++ b/shared/src/index_manager/pso_single.rs @@ -11,11 +11,15 @@ pub struct PSOSingleIndex { } impl TripleIndex for PSOSingleIndex { - fn new() -> Self { - Self { - pso: HashMap::new(), - } + fn clone_box(&self) -> Box { + Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.pso.values() + .map(|sub_map| sub_map.values().map(|objs| objs.len()).sum::()) + .sum() + } fn supported_access_patterns(&self) -> AccessPatternSupport { AccessPatternSupport { @@ -267,6 +271,12 @@ impl TripleIndex for PSOSingleIndex { } impl PSOSingleIndex { + pub fn new() -> Self { + Self { + pso: HashMap::new(), + } + } + /// Efficiently merge another index into this one using parallel processing where possible pub fn merge_from(&mut self, other: PSOSingleIndex) { From bf5ee07783cd5986e56d9dea8ee056ac6910ac90 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Mon, 16 Feb 2026 02:38:42 +0100 Subject: [PATCH 04/23] bugfixes --- kolibrie/src/sparql_database.rs | 26 ++++---------------------- shared/src/index_manager/hexastore.rs | 4 ++++ shared/src/index_manager/mod.rs | 1 + shared/src/index_manager/pso_single.rs | 4 ++++ 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index 3c920b9..8ae4b5f 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -1489,7 +1489,7 @@ impl SparqlDatabase { dictionary: merged_dictionary, prefixes: self.prefixes.clone(), udfs: HashMap::new(), - index_manager: Box::new(HexastoreIndex::new()), + index_manager: self.index_manager.clone_empty(), rule_map: HashMap::new(), 
cached_stats: None, } @@ -1558,7 +1558,7 @@ impl SparqlDatabase { dictionary: self.dictionary.clone(), prefixes: self.prefixes.clone(), udfs: HashMap::new(), - index_manager: Box::new(HexastoreIndex::new()), + index_manager: self.index_manager.clone_empty(), rule_map: HashMap::new(), cached_stats: None, } @@ -2945,26 +2945,8 @@ impl SparqlDatabase { // Get all triples as a vector for parallel processing let triples: Vec = self.triples.iter().cloned().collect(); - // Calculate optimal chunk size based on available cores and data size - let num_threads = rayon::current_num_threads(); - let chunk_size = (triples.len() / num_threads).max(1000); - - // Build indexes in parallel chunks - let partial_indexes: Vec<_> = triples - .par_chunks(chunk_size) - .map(|chunk| { - let mut local_index = shared::index_manager::HexastoreIndex::new(); - for triple in chunk { - local_index.insert(triple); - } - local_index - }) - .collect(); - - // Merge all partial indexes - for partial_index in partial_indexes { - self.index_manager.merge_from(partial_index); - } + self.index_manager.clear(); + self.index_manager.build_from_triples(&triples); // Optimize the final merged index self.index_manager.optimize(); diff --git a/shared/src/index_manager/hexastore.rs b/shared/src/index_manager/hexastore.rs index 6e5f35c..9bb4dbf 100644 --- a/shared/src/index_manager/hexastore.rs +++ b/shared/src/index_manager/hexastore.rs @@ -16,6 +16,10 @@ pub struct HexastoreIndex { } impl TripleIndex for HexastoreIndex { + fn clone_empty(&self) -> Box { + Box::new(HexastoreIndex::new()) + } + fn clone_box(&self) -> Box { Box::new(self.clone()) } diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs index 1a766b1..524ad20 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -34,6 +34,7 @@ pub trait TripleIndex: Send + Sync + std::fmt::Debug { fn insert(&mut self, triple: &Triple) -> bool; fn delete(&mut self, triple: &Triple) -> bool; fn clear(&mut 
self); + fn clone_empty(&self) -> Box; // ── Pattern query ── /// Returns all triples matching the (s?, p?, o?) pattern. diff --git a/shared/src/index_manager/pso_single.rs b/shared/src/index_manager/pso_single.rs index c3b7824..729d717 100644 --- a/shared/src/index_manager/pso_single.rs +++ b/shared/src/index_manager/pso_single.rs @@ -11,6 +11,10 @@ pub struct PSOSingleIndex { } impl TripleIndex for PSOSingleIndex { + fn clone_empty(&self) -> Box { + Box::new(PSOSingleIndex::new()) + } + fn clone_box(&self) -> Box { Box::new(self.clone()) } From ee1b4b6f157befd101f39b7044a1ab4d72797643 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Mon, 16 Feb 2026 02:50:00 +0100 Subject: [PATCH 05/23] small fix --- shared/src/join_algorithm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shared/src/join_algorithm.rs b/shared/src/join_algorithm.rs index 6b82fe8..392af04 100644 --- a/shared/src/join_algorithm.rs +++ b/shared/src/join_algorithm.rs @@ -10,7 +10,7 @@ use crate::dictionary::Dictionary; use crate::triple::Triple; -use crate::index_manager::*; +use crate::index_manager::TripleIndex; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; use rayon::prelude::*; @@ -19,7 +19,7 @@ pub fn perform_join_par_simd_with_strict_filter_4_redesigned_streaming( subject_var: String, predicate: String, object_var: String, - index_manager: Box, // ← Pass index instead of database + index_manager: Box, // ← Pass index instead of database dictionary: &Dictionary, final_results: Vec>, literal_filter: Option, From 5a47e19bcb5ce0e7286060fbd41a48137afbea12 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Tue, 3 Mar 2026 03:38:11 +0100 Subject: [PATCH 06/23] Add other single index permutations --- shared/src/index_manager/ops_single.rs | 328 +++++++++++++++++++++++++ shared/src/index_manager/osp_single.rs | 220 +++++++++++++++++ shared/src/index_manager/pos_single.rs | 220 +++++++++++++++++ 
shared/src/index_manager/sop_single.rs | 222 +++++++++++++++++ shared/src/index_manager/spo_single.rs | 253 +++++++++++++++++++ 5 files changed, 1243 insertions(+) create mode 100644 shared/src/index_manager/ops_single.rs create mode 100644 shared/src/index_manager/osp_single.rs create mode 100644 shared/src/index_manager/pos_single.rs create mode 100644 shared/src/index_manager/sop_single.rs create mode 100644 shared/src/index_manager/spo_single.rs
diff --git a/shared/src/index_manager/ops_single.rs b/shared/src/index_manager/ops_single.rs
new file mode 100644
index 0000000..93b5fc4
--- /dev/null
+++ b/shared/src/index_manager/ops_single.rs
@@ -0,0 +1,328 @@
+use serde::{Serialize, Deserialize};
+use std::collections::{HashMap, HashSet};
+use crate::terms::*;
+use crate::triple::Triple;
+use crate::index_manager::*;
+
+/// Single-permutation triple index: object -> predicate -> set of subjects.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OPSSingleIndex {
+    /// The only stored permutation: `ops[object][predicate]` holds the subjects.
+    pub ops: HashMap<u32, HashMap<u32, HashSet<u32>>>,
+}
+
+impl TripleIndex for OPSSingleIndex {
+    /// Returns a new, empty index of the same concrete type.
+    fn clone_empty(&self) -> Box<dyn TripleIndex> {
+        Box::new(OPSSingleIndex::new())
+    }
+
+    /// Returns a boxed copy of this index, contents included.
+    fn clone_box(&self) -> Box<dyn TripleIndex> {
+        Box::new(self.clone())
+    }
+
+    /// Counts stored triples by summing the sizes of all subject sets.
+    fn triple_count(&self) -> usize {
+        self.ops.values()
+            .map(|pred_map| pred_map.values().map(|subjects| subjects.len()).sum::<usize>())
+            .sum()
+    }
+
+    /// Only the object-predicate (`op`) access pattern is served directly.
+    fn supported_access_patterns(&self) -> AccessPatternSupport {
+        AccessPatternSupport {
+            sp: false, so: false, po: false,
+            ps: false, os: false, op: true
+        }
+    }
+
+    /// Insert a single triple; returns `false` if it was already stored.
+    fn insert(&mut self, triple: &Triple) -> bool {
+        let Triple { subject: s, predicate: p, object: o } = *triple;
+        // `HashSet::insert` reports whether the value was newly added.
+        self.ops.entry(o).or_default().entry(p).or_default().insert(s)
+    }
+
+    /// Delete a single triple; returns `false` if it was not stored.
+    fn delete(&mut self, triple: &Triple) -> bool {
+        let Triple { subject: s, predicate: p, object: o } = *triple;
+
+        // Walk object -> predicate -> subjects, the nesting order of `ops`.
+        if let Some(pred_map) = self.ops.get_mut(&o) {
+            if let Some(subjects) = pred_map.get_mut(&p) {
+                if subjects.remove(&s) {
+                    // Drop containers left empty by the removal.
+                    if subjects.is_empty() {
+                        pred_map.remove(&p);
+                    }
+                    if pred_map.is_empty() {
+                        self.ops.remove(&o);
+                    }
+                    return true;
+                }
+            }
+        }
+        false // triple doesn't exist
+    }
+
+    /// Bulk-build the index from a list of triples, replacing existing content.
+    fn build_from_triples(&mut self, triples: &[Triple]) {
+        use rayon::prelude::*;
+
+        self.clear();
+
+        if triples.is_empty() {
+            return;
+        }
+
+        // Pre-allocate with capacity estimates
+        let capacity = (triples.len() / 100).max(1);
+        self.ops.reserve(capacity);
+
+        // Build indexes in parallel by creating partial indexes and merging
+        let num_threads = rayon::current_num_threads();
+        let chunk_size = (triples.len() / num_threads).max(10_000);
+
+        let partial_indexes: Vec<OPSSingleIndex> = triples
+            .par_chunks(chunk_size)
+            .map(|chunk| {
+                let mut local_index = OPSSingleIndex::new();
+
+                // Pre-allocate local index
+                let local_capacity = (chunk.len() / 50).max(1);
+                local_index.ops.reserve(local_capacity);
+
+                // Insert triples into local index
+                for triple in chunk {
+                    local_index.insert_optimized(triple);
+                }
+
+                local_index
+            })
+            .collect();
+
+        // Sequentially merge partial indexes
+        for partial_index in partial_indexes {
+            self.merge_from(partial_index);
+        }
+
+        // Optimize memory layout after building
+        self.optimize_post_build();
+    }
+
+    /// Query the index; `None` in a position acts as a wildcard.
+    fn query(&self, s: Option<u32>, p: Option<u32>, o: Option<u32>) -> Vec<Triple> {
+        let mut results = Vec::new();
+
+        match (s, p, o) {
+            // Fully bound
+            (Some(ss), Some(pp), Some(oo)) => {
+                if let Some(pred_map) = self.ops.get(&oo) {
+                    if let Some(subjects) = pred_map.get(&pp) {
+                        if subjects.contains(&ss) {
+                            results.push(Triple { subject: ss, predicate: pp, object: oo });
+                        }
+                    }
+                }
+            }
+            // (S, P, -)
+            (Some(ss), Some(pp), None) => {
+                for (&obj, pred_map) in &self.ops {
+                    if let Some(subjects) = pred_map.get(&pp) {
+                        if subjects.contains(&ss) {
+                            results.push(Triple { subject: ss, predicate: pp, object: obj });
+                        }
+                    }
+                }
+            }
+            // (S, -, O)
+            (Some(ss), None, Some(oo)) => {
+                if let Some(pred_map) = self.ops.get(&oo) {
+                    for (&pred, subjects) in pred_map {
+                        if subjects.contains(&ss) {
+                            results.push(Triple { subject: ss, predicate: pred, object: oo });
+                        }
+                    }
+                }
+            }
+            // (-, P, O)
+            (None, Some(pp), Some(oo)) => {
+                if let Some(pred_map) = self.ops.get(&oo) {
+                    if let Some(subjects) = pred_map.get(&pp) {
+                        for &subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pp, object: oo });
+                        }
+                    }
+                }
+            }
+            // (S, -, -)
+            (Some(ss), None, None) => {
+                for (&obj, pred_map) in &self.ops {
+                    for (&pred, subjects) in pred_map {
+                        if subjects.contains(&ss) {
+                            results.push(Triple { subject: ss, predicate: pred, object: obj });
+                        }
+                    }
+                }
+            }
+            // (-, P, -)
+            (None, Some(pp), None) => {
+                for (&obj, pred_map) in &self.ops {
+                    if let Some(subjects) = pred_map.get(&pp) {
+                        for &subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pp, object: obj });
+                        }
+                    }
+                }
+            }
+            // (-, -, O)
+            (None, None, Some(oo)) => {
+                if let Some(pred_map) = self.ops.get(&oo) {
+                    for (&pred, subjects) in pred_map {
+                        for &subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pred, object: oo });
+                        }
+                    }
+                }
+            }
+            // (-, -, -) => all
+            (None, None, None) => {
+                for (&obj, pred_map) in &self.ops {
+                    for (&pred, subjects) in pred_map {
+                        for &subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pred, object: obj });
+                        }
+                    }
+                }
+            }
+        }
+
+        results
+    }
+
+    /// Return all triples that match a given `TriplePattern`
+    fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec<Triple> {
+        let (s, p, o) = pattern;
+        let sub = match s {
+            Constant(x) => Some(*x),
+            Variable(_) => None,
+        };
+        let pre = match p {
+            Constant(x) => Some(*x),
+            Variable(_) => None,
+        };
+        let obj = match o {
+            Constant(x) => Some(*x),
+            Variable(_) => None,
+        };
+
+        self.query(sub, pre, obj)
+    }
+
+    /// Clear all data in the index
+    fn clear(&mut self) {
+        self.ops.clear();
+    }
+
+    /// Subject-Predicate scan: not backed by this index, always `None`.
+    fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Subject-Object scan: not backed by this index, always `None`.
+    fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Predicate-Object scan: not backed by this index, always `None`.
+    fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Predicate-Subject scan: not backed by this index, always `None`.
+    fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Object-Subject scan: not backed by this index, always `None`.
+    fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Scan using the Object-Predicate index (ops); yields the matching subjects.
+    fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet<u32>> {
+        self.ops
+            .get(&o)
+            .and_then(|pred_map| pred_map.get(&p))
+    }
+
+    /// Release excess capacity held by every level of the index.
+    fn optimize(&mut self) {
+        use rayon::prelude::*;
+
+        // Optimize OPS index
+        self.ops.par_iter_mut().for_each(|(_, pred_map)| {
+            pred_map.par_iter_mut().for_each(|(_, subj_set)| {
+                subj_set.shrink_to_fit();
+            });
+            pred_map.shrink_to_fit();
+        });
+        self.ops.shrink_to_fit();
+    }
+}
+
+impl OPSSingleIndex {
+    /// Creates an empty index.
+    pub fn new() -> Self {
+        Self {
+            ops: HashMap::new(),
+        }
+    }
+
+    /// Merge another index into this one, consuming `other`.
+    pub fn merge_from(&mut self, other: OPSSingleIndex) {
+        // Merge OPS index
+        for (o, pred_map) in other.ops {
+            let entry = self.ops.entry(o).or_insert_with(HashMap::new);
+            for (p, subj_set) in pred_map {
+                entry.entry(p).or_insert_with(HashSet::new).extend(subj_set);
+            }
+        }
+    }
+
+    /// Bulk-load variant of `insert` that pre-sizes newly created containers.
+    #[inline]
+    fn insert_optimized(&mut self, triple: &Triple) -> bool {
+        let Triple { subject: s, predicate: p, object: o } = *triple;
+
+        // Check for duplicates
+        if let Some(pred_map) = self.ops.get(&o) {
+            if let Some(subjects) = pred_map.get(&p) {
+                if subjects.contains(&s) {
+                    return false;
+                }
+            }
+        }
+
+        // Insert, pre-sizing any map or set that has to be created
+        self.ops.entry(o).or_insert_with(|| HashMap::with_capacity(16))
+            .entry(p).or_insert_with(|| HashSet::with_capacity(8))
+            .insert(s);
+
+        true
+    }
+
+    /// Shrink all containers once a bulk build has finished.
+    fn optimize_post_build(&mut self) {
+        use rayon::prelude::*;
+
+        self.ops.par_iter_mut().for_each(|(_, pred_map)| {
+            pred_map.shrink_to_fit();
+            pred_map.par_iter_mut().for_each(|(_, subj_set)| {
+                subj_set.shrink_to_fit();
+            });
+        });
+        self.ops.shrink_to_fit();
+    }
+}
diff --git a/shared/src/index_manager/osp_single.rs b/shared/src/index_manager/osp_single.rs new file mode 100644 index 0000000..811997e --- /dev/null +++ b/shared/src/index_manager/osp_single.rs @@ -0,0 +1,220 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OSPSingleIndex { + pub osp: HashMap>>, +} + +impl TripleIndex for OSPSingleIndex { + fn clone_empty(&self) -> Box { Box::new(OSPSingleIndex::new()) } + fn clone_box(&self) -> Box { Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.osp.values().map(|sub_map| sub_map.values().map(|ps| ps.len()).sum::()).sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: false, po: false, + ps: false, os: true, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.osp.get(&o) { + if let Some(preds) = sub_map.get(&s) { + if preds.contains(&p) { return false; } + } + } + self.osp.entry(o).or_default().entry(s).or_default().insert(p); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.osp.get_mut(&o) { + if let Some(preds) = sub_map.get_mut(&s) { + if preds.remove(&p) { + if preds.is_empty() { sub_map.remove(&s); 
} + if sub_map.is_empty() { self.osp.remove(&o); } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + self.clear(); + if triples.is_empty() { return; } + + let capacity = (triples.len() / 100).max(1); + self.osp.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = OSPSingleIndex::new(); + local.osp.reserve((chunk.len() / 50).max(1)); + for t in chunk { local.insert_optimized(t); } + local + }) + .collect(); + + for p in partials { self.merge_from(p); } + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + if let Some(preds) = sub_map.get(&ss) { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + for (&obj, sub_map) in &self.osp { + if let Some(preds) = sub_map.get(&ss) { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + if let Some(preds) = sub_map.get(&ss) { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + for (&sub, preds) in sub_map { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + for (&obj, sub_map) in &self.osp { + if let Some(preds) = sub_map.get(&ss) { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), 
None) => { + for (&obj, sub_map) in &self.osp { + for (&sub, preds) in sub_map { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + for (&sub, preds) in sub_map { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&obj, sub_map) in &self.osp { + for (&sub, preds) in sub_map { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { self.osp.clear(); } + + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + self.osp.get(&o).and_then(|sub_map| sub_map.get(&s)) + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.osp.par_iter_mut().for_each(|(_, sub_map)| { + sub_map.par_iter_mut().for_each(|(_, preds)| { preds.shrink_to_fit(); }); + sub_map.shrink_to_fit(); + }); + self.osp.shrink_to_fit(); + } +} + +impl OSPSingleIndex { + pub fn new() -> Self { Self { osp: HashMap::new() } } + + pub fn merge_from(&mut self, other: OSPSingleIndex) { + for (o, sub_map) in other.osp { + let entry = self.osp.entry(o).or_insert_with(HashMap::new); + for (s, preds) in sub_map { + 
entry.entry(s).or_insert_with(HashSet::new).extend(preds); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.osp.get(&o) { + if let Some(preds) = sub_map.get(&s) { + if preds.contains(&p) { return false; } + } + } + self.osp.entry(o).or_insert_with(|| HashMap::with_capacity(16)) + .entry(s).or_insert_with(|| HashSet::with_capacity(8)) + .insert(p); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.osp.par_iter_mut().for_each(|(_, sub_map)| { + sub_map.shrink_to_fit(); + sub_map.par_iter_mut().for_each(|(_, preds)| { preds.shrink_to_fit(); }); + }); + self.osp.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/pos_single.rs b/shared/src/index_manager/pos_single.rs new file mode 100644 index 0000000..9cc3306 --- /dev/null +++ b/shared/src/index_manager/pos_single.rs @@ -0,0 +1,220 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct POSSingleIndex { + pub pos: HashMap>>, +} + +impl TripleIndex for POSSingleIndex { + fn clone_empty(&self) -> Box { Box::new(POSSingleIndex::new()) } + fn clone_box(&self) -> Box { Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.pos.values().map(|obj_map| obj_map.values().map(|subs| subs.len()).sum::()).sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: false, po: true, + ps: false, os: false, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.pos.get(&p) { + if let Some(subs) = obj_map.get(&o) { + if subs.contains(&s) { return false; } + } + } + 
self.pos.entry(p).or_default().entry(o).or_default().insert(s); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.pos.get_mut(&p) { + if let Some(subs) = obj_map.get_mut(&o) { + if subs.remove(&s) { + if subs.is_empty() { obj_map.remove(&o); } + if obj_map.is_empty() { self.pos.remove(&p); } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + self.clear(); + if triples.is_empty() { return; } + + let capacity = (triples.len() / 100).max(1); + self.pos.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = POSSingleIndex::new(); + local.pos.reserve((chunk.len() / 50).max(1)); + for t in chunk { local.insert_optimized(t); } + local + }) + .collect(); + + for p in partials { self.merge_from(p); } + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(obj_map) = self.pos.get(&pp) { + if let Some(subs) = obj_map.get(&oo) { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + if let Some(obj_map) = self.pos.get(&pp) { + for (&obj, subs) in obj_map { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + for (&pred, obj_map) in &self.pos { + if let Some(subs) = obj_map.get(&oo) { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + if let Some(obj_map) = self.pos.get(&pp) { + if let Some(subs) = obj_map.get(&oo) { 
+ for &sub in subs { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + for (&pred, obj_map) in &self.pos { + for (&obj, subs) in obj_map { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + if let Some(obj_map) = self.pos.get(&pp) { + for (&obj, subs) in obj_map { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + for (&pred, obj_map) in &self.pos { + if let Some(subs) = obj_map.get(&oo) { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&pred, obj_map) in &self.pos { + for (&obj, subs) in obj_map { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { self.pos.clear(); } + + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + self.pos.get(&p).and_then(|obj_map| obj_map.get(&o)) + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, subs)| { subs.shrink_to_fit(); }); + 
obj_map.shrink_to_fit(); + }); + self.pos.shrink_to_fit(); + } +} + +impl POSSingleIndex { + pub fn new() -> Self { Self { pos: HashMap::new() } } + + pub fn merge_from(&mut self, other: POSSingleIndex) { + for (p, obj_map) in other.pos { + let entry = self.pos.entry(p).or_insert_with(HashMap::new); + for (o, subs) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(subs); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.pos.get(&p) { + if let Some(subs) = obj_map.get(&o) { + if subs.contains(&s) { return false; } + } + } + self.pos.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(o).or_insert_with(|| HashSet::with_capacity(8)) + .insert(s); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, subs)| { subs.shrink_to_fit(); }); + }); + self.pos.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/sop_single.rs b/shared/src/index_manager/sop_single.rs new file mode 100644 index 0000000..22e24f6 --- /dev/null +++ b/shared/src/index_manager/sop_single.rs @@ -0,0 +1,222 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SOPSingleIndex { + pub sop: HashMap>>, +} + +impl TripleIndex for SOPSingleIndex { + fn clone_empty(&self) -> Box { Box::new(SOPSingleIndex::new()) } + fn clone_box(&self) -> Box { Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.sop.values().map(|obj_map| obj_map.values().map(|ps| ps.len()).sum::()).sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: true, po: false, + ps: false, os: false, 
op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.sop.get(&s) { + if let Some(preds) = obj_map.get(&o) { + if preds.contains(&p) { return false; } + } + } + self.sop.entry(s).or_default().entry(o).or_default().insert(p); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.sop.get_mut(&s) { + if let Some(preds) = obj_map.get_mut(&o) { + if preds.remove(&p) { + if preds.is_empty() { obj_map.remove(&o); } + if obj_map.is_empty() { self.sop.remove(&s); } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + self.clear(); + if triples.is_empty() { return; } + + let capacity = (triples.len() / 100).max(1); + self.sop.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = SOPSingleIndex::new(); + local.sop.reserve((chunk.len() / 50).max(1)); + for t in chunk { local.insert_optimized(t); } + local + }) + .collect(); + + for p in partials { self.merge_from(p); } + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(obj_map) = self.sop.get(&ss) { + if let Some(preds) = obj_map.get(&oo) { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + if let Some(obj_map) = self.sop.get(&ss) { + for (&obj, preds) in obj_map { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + if let Some(obj_map) = 
self.sop.get(&ss) { + if let Some(preds) = obj_map.get(&oo) { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + for (&sub, obj_map) in &self.sop { + if let Some(preds) = obj_map.get(&oo) { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + if let Some(obj_map) = self.sop.get(&ss) { + for (&obj, preds) in obj_map { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + for (&sub, obj_map) in &self.sop { + for (&obj, preds) in obj_map { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + for (&sub, obj_map) in &self.sop { + if let Some(preds) = obj_map.get(&oo) { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&sub, obj_map) in &self.sop { + for (&obj, preds) in obj_map { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { self.sop.clear(); } + + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + self.sop.get(&s).and_then(|obj_map| obj_map.get(&o)) + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: 
u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, preds)| { + preds.shrink_to_fit(); + }); + obj_map.shrink_to_fit(); + }); + self.sop.shrink_to_fit(); + } +} + +impl SOPSingleIndex { + pub fn new() -> Self { Self { sop: HashMap::new() } } + + pub fn merge_from(&mut self, other: SOPSingleIndex) { + for (s, obj_map) in other.sop { + let entry = self.sop.entry(s).or_insert_with(HashMap::new); + for (o, preds) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(preds); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.sop.get(&s) { + if let Some(preds) = obj_map.get(&o) { + if preds.contains(&p) { return false; } + } + } + self.sop.entry(s).or_insert_with(|| HashMap::with_capacity(16)) + .entry(o).or_insert_with(|| HashSet::with_capacity(8)) + .insert(p); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, preds)| { preds.shrink_to_fit(); }); + }); + self.sop.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/spo_single.rs b/shared/src/index_manager/spo_single.rs new file mode 100644 index 0000000..10a25f4 --- /dev/null +++ b/shared/src/index_manager/spo_single.rs @@ -0,0 +1,253 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SPOSingleIndex { + pub spo: HashMap>>, +} + +impl TripleIndex for SPOSingleIndex { + fn clone_empty(&self) -> Box { + Box::new(SPOSingleIndex::new()) + } + + fn clone_box(&self) -> Box { + 
Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + self.spo.values() + .map(|pred_map| pred_map.values().map(|objs| objs.len()).sum::()) + .sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: true, so: false, po: false, + ps: false, os: false, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objects) = pred_map.get(&p) { + if objects.contains(&o) { + return false; + } + } + } + self.spo.entry(s).or_default().entry(p).or_default().insert(o); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + if let Some(pred_map) = self.spo.get_mut(&s) { + if let Some(obj_set) = pred_map.get_mut(&p) { + if obj_set.remove(&o) { + // cleanup empty maps + if obj_set.is_empty() { + pred_map.remove(&p); + } + if pred_map.is_empty() { + self.spo.remove(&s); + } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + + self.clear(); + if triples.is_empty() { + return; + } + + let capacity = (triples.len() / 100).max(1); + self.spo.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = SPOSingleIndex::new(); + let local_capacity = (chunk.len() / 50).max(1); + local.spo.reserve(local_capacity); + + for t in chunk { + local.insert_optimized(t); + } + local + }) + .collect(); + + for part in partials { + self.merge_from(part); + } + + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let 
Some(objs) = pred_map.get(&pp) { + if objs.contains(&oo) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let Some(objs) = pred_map.get(&pp) { + for &obj in objs { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + if let Some(pred_map) = self.spo.get(&ss) { + for (&pred, objs) in pred_map { + if objs.contains(&oo) { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + for (&sub, pred_map) in &self.spo { + if let Some(objs) = pred_map.get(&pp) { + if objs.contains(&oo) { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + if let Some(pred_map) = self.spo.get(&ss) { + for (&pred, objs) in pred_map { + for &obj in objs { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + for (&sub, pred_map) in &self.spo { + if let Some(objs) = pred_map.get(&pp) { + for &obj in objs { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + for (&sub, pred_map) in &self.spo { + for (&pred, objs) in pred_map { + if objs.contains(&oo) { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&sub, pred_map) in &self.spo { + for (&pred, objs) in pred_map { + for &obj in objs { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + 
self.query(sub, pre, obj) + } + + fn clear(&mut self) { + self.spo.clear(); + } + + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + self.spo.get(&s).and_then(|pred_map| pred_map.get(&p)) + } + + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + pred_map.shrink_to_fit(); + }); + self.spo.shrink_to_fit(); + } +} + +impl SPOSingleIndex { + pub fn new() -> Self { + Self { spo: HashMap::new() } + } + + pub fn merge_from(&mut self, other: SPOSingleIndex) { + for (s, pred_map) in other.spo { + let entry = self.spo.entry(s).or_insert_with(HashMap::new); + for (p, obj_set) in pred_map { + entry.entry(p).or_insert_with(HashSet::new).extend(obj_set); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objs) = pred_map.get(&p) { + if objs.contains(&o) { return false; } + } + } + self.spo.entry(s).or_insert_with(|| HashMap::with_capacity(16)) + .entry(p).or_insert_with(|| HashSet::with_capacity(8)) + .insert(o); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.shrink_to_fit(); + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.spo.shrink_to_fit(); + } +} From 02e8f2d753610a4dc2bc218e572e5d5d23de85a8 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Tue, 3 Mar 2026 03:41:19 +0100 Subject: [PATCH 07/23] Add new single index 
permutation exports --- shared/src/index_manager/mod.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs index 524ad20..34abc54 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -14,9 +14,20 @@ use crate::terms::Term::*; use crate::triple::Triple; pub use hexastore::HexastoreIndex; +pub use ops_single::OPSSingleIndex; +pub use osp_single::OSPSingleIndex; +pub use pos_single::POSSingleIndex; pub use pso_single::PSOSingleIndex; +pub use sop_single::SOPSingleIndex; +pub use spo_single::SPOSingleIndex; pub mod hexastore; +pub mod ops_single; +pub mod osp_single; +pub mod pos_single; pub mod pso_single; +pub mod sop_single; +pub mod spo_single; + /// Describes which access patterns an index can serve efficiently. #[derive(Debug, Clone)] From bce15f517f5e8cd536cd5a8f6ba65ca23f138c89 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Wed, 4 Mar 2026 20:51:39 +0100 Subject: [PATCH 08/23] Add single table index, idea for dynamic hexastore and bench all indexing methods --- .gitignore | 1 + .../n_triples_data/n_triple_10M.rs | 439 ++++++++++-------- .../n_triple_10M_all_indexes.sh | 57 +++ shared/src/index_manager/dynamic_hexastore.rs | 49 ++ shared/src/index_manager/mod.rs | 2 + shared/src/index_manager/ops_single.rs | 12 +- shared/src/index_manager/single_table.rs | 149 ++++++ 7 files changed, 498 insertions(+), 211 deletions(-) create mode 100755 kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh create mode 100644 shared/src/index_manager/dynamic_hexastore.rs create mode 100644 shared/src/index_manager/single_table.rs diff --git a/.gitignore b/.gitignore index 631ef77..2eb1831 100644 --- a/.gitignore +++ b/.gitignore @@ -85,6 +85,7 @@ python/.venv/ # Some other directories benchmark_dataset/ +kolibrie/examples/sparql_syntax/n_triples_data/benchmark_results/ # IntelliJ .idea/ diff --git 
a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index dbd23e2..49695bd 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -1,111 +1,140 @@ /* - * Copyright © 2025 Volodymyr Kadzhaia - * Copyright © 2025 Pieter Bonte - * KU Leuven — Stream Intelligence Lab, Belgium - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this file, - * you can obtain one at https://mozilla.org/MPL/2.0/. - * - * - * - * NOTE 1: We are using the benchmark dataset from: - * Waterloo SPARQL Diversity Test Suite (WatDiv) v0.6 - * Source: https://dsg.uwaterloo.ca/watdiv/ - * - * NOTE 2: Before running with the 10M-triple dataset, ensure you have: - * 1) Downloaded `watdiv.10M.nt` into a `benchmark_dataset` directory - * at the project root. - * 2) Created the `benchmark_dataset` directory next to `kolibrie/`. - * (e.g., `mkdir benchmark_dataset && mv watdiv.10M.nt benchmark_dataset/`) - * - * NOTE 3: The watdiv.10M.nt file is approximately 1.5 GB in size. - * - */ +* Copyright © 2025 Volodymyr Kadzhaia +* Copyright © 2025 Pieter Bonte +* KU Leuven — Stream Intelligence Lab, Belgium +* +* This Source Code Form is subject to the terms of the Mozilla Public +* License, v. 2.0. If a copy of the MPL was not distributed with this file, +* you can obtain one at https://mozilla.org/MPL/2.0/. +* +* +* +* NOTE 1: We are using the benchmark dataset from: +* Waterloo SPARQL Diversity Test Suite (WatDiv) v0.6 +* Source: https://dsg.uwaterloo.ca/watdiv/ +* +* NOTE 2: Before running with the 10M-triple dataset, ensure you have: +* 1) Downloaded `watdiv.10M.nt` into a `benchmark_dataset` directory +* at the project root. +* 2) Created the `benchmark_dataset` directory next to `kolibrie/`. 
+* (e.g., `mkdir benchmark_dataset && mv watdiv.10M.nt benchmark_dataset/`) +* +* NOTE 3: The watdiv.10M.nt file is approximately 1.5 GB in size. +* +*/ use kolibrie::execute_query::*; use kolibrie::sparql_database::*; use std::fs::File; use std::io::{BufRead, BufReader}; use std::time::Instant; +use shared::index_manager::*; + +fn make_index_from_env() -> (String, Box) { + let index_type = std::env::var("INDEX_TYPE") + .unwrap_or_else(|_| "hexastore".to_string()) + .to_lowercase(); + + let index: Box = match index_type.as_str() { + "hexastore" => Box::new(HexastoreIndex::new()), + "" => Box::new(HexastoreIndex::new()), + "spo" => Box::new(SPOSingleIndex::new()), + "pos" => Box::new(POSSingleIndex::new()), + "osp" => Box::new(OSPSingleIndex::new()), + "pso" => Box::new(PSOSingleIndex::new()), + "ops" => Box::new(OPSSingleIndex::new()), + "sop" => Box::new(SOPSingleIndex::new()), + "table" => Box::new(SingleTableIndex::new()), + other => { + eprintln!("WARNING: Unknown INDEX_TYPE '{}', falling back to hexastore.", other); + Box::new(HexastoreIndex::new()) + } + }; + + (index_type, index) +} fn parse_large_ntriples_file( - file_path: &str, + file_path: &str, ) -> Result> { - println!("Starting to parse N-Triples file: {}", file_path); - let start_time = Instant::now(); + let (index_name, index) = make_index_from_env(); + println!("INDEX_TYPE = {}", index_name); + println!("Starting to parse N-Triples file: {}", file_path); + let start_time = Instant::now(); - let mut db = SparqlDatabase::new(); + //let mut db = SparqlDatabase::new(); + //let mut db = SparqlDatabase::with_index(Box::new(SPOSingleIndex::new())); + let mut db = SparqlDatabase::with_index(index); - // Much smaller buffer and more aggressive memory management - let file = File::open(file_path)?; - let reader = BufReader::with_capacity(64 * 1024, file); // Reduced buffer size + // Much smaller buffer and more aggressive memory management + let file = File::open(file_path)?; + let reader = 
BufReader::with_capacity(64 * 1024, file); // Reduced buffer size - let mut line_count = 0; - let mut batch_lines = Vec::new(); - const BATCH_SIZE: usize = 10_000; // Much smaller batch size + let mut line_count = 0; + let mut batch_lines = Vec::new(); + const BATCH_SIZE: usize = 10_000; // Much smaller batch size - for line_result in reader.lines() { - let line = line_result?; + for line_result in reader.lines() { + let line = line_result?; - if line.trim().is_empty() || line.starts_with('#') { - continue; - } + if line.trim().is_empty() || line.starts_with('#') { + continue; + } - batch_lines.push(line); - line_count += 1; + batch_lines.push(line); + line_count += 1; - if batch_lines.len() >= BATCH_SIZE { - // Process batch immediately - let batch_data = batch_lines.join("\n"); - db.parse_ntriples_and_add(&batch_data); + if batch_lines.len() >= BATCH_SIZE { + // Process batch immediately + let batch_data = batch_lines.join("\n"); + db.parse_ntriples_and_add(&batch_data); - // Aggressive cleanup - batch_lines.clear(); - batch_lines.shrink_to_fit(); + // Aggressive cleanup + batch_lines.clear(); + batch_lines.shrink_to_fit(); - // Progress info every 100k triples - if line_count % 100_000 == 0 { - println!("Processed {} triples", line_count); - std::hint::black_box(()); + // Progress info every 100k triples + if line_count % 100_000 == 0 { + println!("Processed {} triples", line_count); + std::hint::black_box(()); - // Optional: small delay to let the system breathe - std::thread::sleep(std::time::Duration::from_millis(10)); - } - } + // Optional: small delay to let the system breathe + std::thread::sleep(std::time::Duration::from_millis(10)); + } } + } - // Process remaining batch - if !batch_lines.is_empty() { - let batch_data = batch_lines.join("\n"); - db.parse_ntriples_and_add(&batch_data); - } - db.get_or_build_stats(); + // Process remaining batch + if !batch_lines.is_empty() { + let batch_data = batch_lines.join("\n"); + 
db.parse_ntriples_and_add(&batch_data); + } + db.get_or_build_stats(); - println!( - "Finished parsing {} triples in {:.2} seconds", - line_count, - start_time.elapsed().as_secs_f64() - ); + println!( + "Finished parsing {} triples in {:.2} seconds", + line_count, + start_time.elapsed().as_secs_f64() + ); - // Build indexes after parsing - this is where the magic happens - println!("Building indexes..."); - let index_start = Instant::now(); - db.build_all_indexes(); - println!("Indexes built in {:.2} seconds", index_start.elapsed().as_secs_f64()); + // Build indexes after parsing - this is where the magic happens + println!("Building indexes..."); + let index_start = Instant::now(); + db.build_all_indexes(); + println!("Indexes built in {:.2} seconds", index_start.elapsed().as_secs_f64()); - Ok(db) + Ok(db) } fn run_all_queries(db: &mut SparqlDatabase) { - const ITERATIONS: usize = 20; + const ITERATIONS: usize = 3; - // (name, query) - let queries: &[(&str, &str)] = &[ - // C1 - ( - "C1", - r#"PREFIX wsdbm: + // (name, query) + let queries: &[(&str, &str)] = &[ + // C1 + ( + "C1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -128,11 +157,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v7 sorg:language ?v8 . } "#, - ), - // C2 - ( - "C2", - r#"PREFIX wsdbm: + ), + // C2 + ( + "C2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -156,11 +185,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v8 rev:totalVotes ?v9 . } "#, - ), - // C3 - ( - "C3", - r#"PREFIX wsdbm: + ), + // C3 + ( + "C3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -181,11 +210,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 foaf:givenName ?v6 . } "#, - ), - // F1 - ( - "F1", - r#"PREFIX wsdbm: + ), + // F1 + ( + "F1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -205,11 +234,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v3 rdf:type wsdbm:ProductCategory2 . 
} "#, - ), - // F2 - ( - "F2", - r#"PREFIX wsdbm: + ), + // F2 + ( + "F2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -231,11 +260,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 wsdbm:hasGenre . } "#, - ), - // F3 - ( - "F3", - r#"PREFIX wsdbm: + ), + // F3 + ( + "F3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -255,11 +284,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v5 wsdbm:purchaseFor ?v0 . } "#, - ), - // F4 - ( - "F4", - r#"PREFIX wsdbm: + ), + // F4 + ( + "F4", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -282,11 +311,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v7 wsdbm:likes ?v0 . } "#, - ), - // F5 - ( - "F5", - r#"PREFIX wsdbm: + ), + // F5 + ( + "F5", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -306,11 +335,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v1 rdf:type ?v6 . } "#, - ), - // L1 - ( - "L1", - r#"PREFIX wsdbm: + ), + // L1 + ( + "L1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -327,11 +356,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 wsdbm:likes ?v2 . } "#, - ), - // L2 - ( - "L2", - r#"PREFIX wsdbm: + ), + // L2 + ( + "L2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -348,11 +377,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v2 sorg:nationality ?v1 . } "#, - ), - // L3 - ( - "L3", - r#"PREFIX wsdbm: + ), + // L3 + ( + "L3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -368,11 +397,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 wsdbm:subscribes . } "#, - ), - // L4 - ( - "L4", - r#"PREFIX wsdbm: + ), + // L4 + ( + "L4", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -388,11 +417,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 ?v2 . } "#, - ), - // L5 - ( - "L5", - r#"PREFIX wsdbm: + ), + // L5 + ( + "L5", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -409,11 +438,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 sorg:nationality ?v3 . 
} "#, - ), - // S1 - ( - "S1", - r#"PREFIX wsdbm: + ), + // S1 + ( + "S1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -436,11 +465,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 sorg:priceValidUntil ?v9 . } "#, - ), - // S2 - ( - "S2", - r#"PREFIX wsdbm: + ), + // S2 + ( + "S2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -458,11 +487,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 rdf:type wsdbm:Role2 . } "#, - ), - // S3 - ( - "S3", - r#"PREFIX wsdbm: + ), + // S3 + ( + "S3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -480,11 +509,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 sorg:publisher ?v4 . } "#, - ), - // S4 - ( - "S4", - r#"PREFIX wsdbm: + ), + // S4 + ( + "S4", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -502,11 +531,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 sorg:nationality wsdbm:Country1 . } "#, - ), - // S5 - ( - "S5", - r#"PREFIX wsdbm: + ), + // S5 + ( + "S5", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -524,11 +553,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 sorg:language wsdbm:Language0 . } "#, - ), - // S6 - ( - "S6", - r#"PREFIX wsdbm: + ), + // S6 + ( + "S6", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -545,11 +574,11 @@ fn run_all_queries(db: &mut SparqlDatabase) { ?v0 wsdbm:hasGenre . } "#, - ), - // S7 - ( - "S7", - r#"PREFIX wsdbm: + ), + // S7 + ( + "S7", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -566,46 +595,46 @@ fn run_all_queries(db: &mut SparqlDatabase) { wsdbm:likes ?v0 . 
} "#, - ), - ]; - - for (name, query) in queries.iter() { - println!("=============================================="); - println!("Running query {} ({} iterations)...", name, ITERATIONS); + ), + ]; - let mut total_time = 0.0; - // let mut last_result:Vec> = Vec::new(); + for (name, query) in queries.iter() { + println!("=============================================="); + println!("Running query {} ({} iterations)...", name, ITERATIONS); - for _ in 0..ITERATIONS { - let start = Instant::now(); - let _ = execute_query_rayon_parallel2_volcano(query, db); - let elapsed = start.elapsed().as_secs_f64(); - total_time += elapsed; - } + let mut total_time = 0.0; + // let mut last_result:Vec> = Vec::new(); - let avg = total_time / (ITERATIONS as f64); - println!("Average time for {}: {:.6} seconds", name, avg); + for _ in 0..ITERATIONS { + let start = Instant::now(); + let _ = execute_query_rayon_parallel2_volcano(query, db); + let elapsed = start.elapsed().as_secs_f64(); + total_time += elapsed; } + + let avg = total_time / (ITERATIONS as f64); + println!("Average time for {}: {:.6} seconds", name, avg); + } } fn main() { - // Set current directory to the root of the project - std::env::set_current_dir(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) + // Set current directory to the root of the project + std::env::set_current_dir(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) .expect("Failed to set project root as current directory"); - let file_path = "../benchmark_dataset/watdiv.10M.nt"; + let file_path = "../benchmark_dataset/watdiv.10M.nt"; - match parse_large_ntriples_file(file_path) { - Ok(mut db) => { - println!("Successfully processed N-Triples file"); - run_all_queries(&mut db); - } - Err(e) => { - eprintln!("Error processing file '{}': {}", file_path, e); - println!( - "File not found or error occurred. \ -Make sure ../benchmark_dataset/watdiv.10M.nt exists." 
- ); - } + match parse_large_ntriples_file(file_path) { + Ok(mut db) => { + println!("Successfully processed N-Triples file"); + run_all_queries(&mut db); + } + Err(e) => { + eprintln!("Error processing file '{}': {}", file_path, e); + println!( + "File not found or error occurred. \ + Make sure ../benchmark_dataset/watdiv.10M.nt exists." + ); } + } } diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh new file mode 100755 index 0000000..d038223 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +# run_all_indexes.sh — Run the n_triple_10M benchmark for every index type +# and save all output to a specified directory. +# +# Usage: +# ./run_all_indexes.sh [output_dir] +# +# If output_dir is not specified, defaults to ./benchmark_results +# + +set -euo pipefail + +OUTPUT_DIR="${1:-./benchmark_results}" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" + +INDEX_TYPES=( + "table" +) + +echo "==============================================" +echo " Kolibrie Index Benchmark Runner" +echo "==============================================" +echo "Output directory: ${RESULT_DIR}" +echo "Index types: ${INDEX_TYPES[*]}" +echo "==============================================" + +mkdir -p "${RESULT_DIR}" + +echo "" +echo "[BUILD] Compiling in release mode..." +cargo build --release --example n_triple_10M 2>&1 | tee "${RESULT_DIR}/build.log" +echo "[BUILD] Done." 
+echo "" + +for idx_type in "${INDEX_TYPES[@]}"; do + OUTPUT_FILE="${RESULT_DIR}/${idx_type}.txt" + + echo "==============================================" + echo "[RUN] INDEX_TYPE=${idx_type}" + echo " Output: ${OUTPUT_FILE}" + echo "==============================================" + + INDEX_TYPE="${idx_type}" \ + cargo run --release --example n_triple_10M \ + 2>&1 | tee "${OUTPUT_FILE}" + + echo "" + echo "[DONE] ${idx_type} -> ${OUTPUT_FILE}" + echo "" +done + +echo "==============================================" +echo " All benchmarks complete!" +echo " Results in: ${RESULT_DIR}" +echo "==============================================" diff --git a/shared/src/index_manager/dynamic_hexastore.rs b/shared/src/index_manager/dynamic_hexastore.rs new file mode 100644 index 0000000..0df27ea --- /dev/null +++ b/shared/src/index_manager/dynamic_hexastore.rs @@ -0,0 +1,49 @@ +//hexastore, but only builds indexes if they would be used +//uses heuristic to determine when index is valuable +//Step 1: dynamic hexastore is initialized with an array of access patterns represented as Triples +//(with bound and or unbound variables, eg: (s, p, ?o) could be determined with sp scan or ps scan) +//Step 2: dynamic hexastore chooses initial necessary indexes, eg if all triples can be solved with +//ps scan, then only having pso index is sufficient. +//Step 3: dynamic hexastore creates index pools per necessary index. And assigns an index pool to +//every access pattern (eg p and s bound, only p bound, ...) +//(An index pool is a pool of indexes with the only rule being that if you join all indexes in a +//pool, every triple in the current window must be present. We use this to switch between used +//indexes dynamically later on. +//Step 4: now we ingest data, when building the index from data or adding a data entry, we want to +//add this entry to every pool exactly once. This means every pool has at most 1 active index to +//which we insert. 
At first this will just be the only index present in the pool (like pso in our +//example). +//Step 5: every set time interval we also check a heuristic to determine if we should switch the +//index used in a pool. To do this, we use the number of unique subjects, predicates and objects +//from the existing stats implementation which has the cardinalities of each of these. +//The heuristic works as follows: We determine a cost as an overhead value (which is static and set +//at build time) + the number of hashset/map lookups. You can guess the number of hashset/map +//lookups as follows: if you scan for example an spo index for a certain predicate, you will do #s +//lookups to find the s number of po maps. If you scan for a bound subject (predicate and object +//not bound) on this same index, you will only have to do 1 lookup to find the po map that has all +//entries. Having only a bound object will necessitate a full table scan: first find every po map +//(#s lookups), then find every o set (#p lookups or #s*#p lookups in total), then for every set +//check if the bound object is present (#s*#p*1 total lookups). We then also guess the cost of +//maintaining an index as follows: MAP_OVERHEAD * (1 + #s) + SET_OVERHEAD * (#s * #p) + SPACE_OVERHEAD * (#s * #p * #o) +//Then we find the set of indexes to maintain that minimizes cost = sum of cost heuristics for each +//access pattern passed during initialisation + sum of cost heuristics for every index in that set. +//There will already be existing indexes, since we at first initialize these based on which access +//patterns we have exclusively (not taking into account data cardinalities, since we don't know that +//yet at initialisation).
To successfully transition from the old indexes without having to do an +//expensive complete copy operation between the indexes, we do the following: +//1) Create a new pool for every index in the new index set. +//2) For every access pattern, use the heuristics to find the best pool to assign it to. +//3) For every new pool, find the old pool that creates the lowest total cost across all access +// patterns assigned to the new pool. +//4) Any unassigned old pools are deleted. +//5) Of the old pool(s) that were assigned to a new pool, check if the desired index of the new +// pool is present in the old pool (doesn't matter if active or not). +// - If so: we don't have to create any new indexes for that pool, and the old pool just becomes +// the new pool, only (possibly) changing the active index. +// - If not: create a new index of the type that we want for the new pool, set that to the +// active index, and keep the indexes of the old pool in the new pool to preserve the data. +//6) In the data deletion function that deletes a triple from the triplestore, (attempt to) delete +// the triple from every index in every pool. Also check for every index if the index is empty +// after this. If so, and it is not the active index, delete that index from the pool. +// By adding to only one index per pool and deleting from every index, eventually, unless we keep +// switching active indexes, each pool will converge to have one index.
diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs index 34abc54..f68c683 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -20,6 +20,7 @@ pub use pos_single::POSSingleIndex; pub use pso_single::PSOSingleIndex; pub use sop_single::SOPSingleIndex; pub use spo_single::SPOSingleIndex; +pub use single_table::SingleTableIndex; pub mod hexastore; pub mod ops_single; pub mod osp_single; @@ -27,6 +28,7 @@ pub mod pos_single; pub mod pso_single; pub mod sop_single; pub mod spo_single; +pub mod single_table; /// Describes which access patterns an index can serve efficiently. diff --git a/shared/src/index_manager/ops_single.rs b/shared/src/index_manager/ops_single.rs index 93b5fc4..f9e9904 100644 --- a/shared/src/index_manager/ops_single.rs +++ b/shared/src/index_manager/ops_single.rs @@ -37,7 +37,7 @@ impl TripleIndex for OPSSingleIndex { let Triple { subject: s, predicate: p, object: o } = *triple; if let Some(pred_map) = self.ops.get(&o) { if let Some(subjects) = pred_map.get(&p) { - if objects.contains(&s) { + if subjects.contains(&s) { return false; // triple already stored } } @@ -86,7 +86,7 @@ impl TripleIndex for OPSSingleIndex { let num_threads = rayon::current_num_threads(); let chunk_size = (triples.len() / num_threads).max(10_000); - let partial_indexes: Vec = triples + let partial_indexes: Vec = triples .par_chunks(chunk_size) .map(|chunk| { let mut local_index = OPSSingleIndex::new(); @@ -160,7 +160,7 @@ impl TripleIndex for OPSSingleIndex { } // (S, -, -) (Some(ss), None, None) => { - for (&obj, pred_map) in self.ops { + for (&obj, pred_map) in &self.ops { for (&pred, subjects) in pred_map { if subjects.contains(&ss) { results.push( Triple { subject: ss, predicate: pred, object: obj }); @@ -170,7 +170,7 @@ impl TripleIndex for OPSSingleIndex { } // (-, P, -) (None, Some(pp), None) => { - for (&obj, pred_map) in self.ops { + for (&obj, pred_map) in &self.ops { if let Some(subjects) = 
pred_map.get(&pp) { for &subj in subjects { results.push(Triple { subject: subj, predicate: pp, object: obj }); @@ -190,7 +190,7 @@ impl TripleIndex for OPSSingleIndex { } // (-, -, -) => all (None, None, None) => { - for (&obj, pred_map) in self.ops { + for (&obj, pred_map) in &self.ops { for (&pred, subjects) in pred_map { for &subj in subjects { results.push(Triple { subject: subj, predicate: pred, object: obj }); @@ -256,7 +256,7 @@ impl TripleIndex for OPSSingleIndex { fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { self.ops .get(&o) - .and_then(|pred_map| pred_map.get(&p)); + .and_then(|pred_map| pred_map.get(&p)) } fn optimize(&mut self) { diff --git a/shared/src/index_manager/single_table.rs b/shared/src/index_manager/single_table.rs new file mode 100644 index 0000000..0980116 --- /dev/null +++ b/shared/src/index_manager/single_table.rs @@ -0,0 +1,149 @@ +use serde::{Serialize, Deserialize}; +use std::collections::HashSet; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SingleTableIndex { + pub table: HashSet, +} + +impl TripleIndex for SingleTableIndex { + fn clone_empty(&self) -> Box { + Box::new(SingleTableIndex::new()) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + self.table.len() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + // no specialized access pattern supported + AccessPatternSupport { sp: false, so: false, po: false, ps: false, os: false, op: false } + } + + fn insert(&mut self, triple: &Triple) -> bool { + // Insert returns true only when the triple was not present before. + self.table.insert(triple.clone()) + } + + fn delete(&mut self, triple: &Triple) -> bool { + // HashSet::remove accepts &T and returns whether an element was removed. 
+ self.table.remove(triple) + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + // simple replace-with-new-set strategy; keeps semantics consistent. + use rayon::prelude::*; + + self.clear(); + + if triples.is_empty() { + return; + } + + // Reserve a reasonable capacity (heuristic) + self.table.reserve(triples.len()); + + // If rayon is available, we can build partial sets and merge them. + let num_threads = rayon::current_num_threads(); + if triples.len() >= 10_000 && num_threads > 1 { + // parallel build + let chunk_size = (triples.len() / num_threads).max(1_000); + let partials: Vec> = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = HashSet::with_capacity(chunk.len()); + for t in chunk { + local.insert(t.clone()); + } + local + }) + .collect(); + + for part in partials { + self.table.extend(part); + } + } else { + // serial build + self.table.extend(triples.iter().cloned()); + } + + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + // brute-force scan across the table; acceptable because this index has no sub-indexes + let mut results = Vec::new(); + + for triple in &self.table { + let matches = match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => triple.subject == ss && triple.predicate == pp && triple.object == oo, + (Some(ss), Some(pp), None) => triple.subject == ss && triple.predicate == pp, + (Some(ss), None, Some(oo)) => triple.subject == ss && triple.object == oo, + (None, Some(pp), Some(oo)) => triple.predicate == pp && triple.object == oo, + (Some(ss), None, None) => triple.subject == ss, + (None, Some(pp), None) => triple.predicate == pp, + (None, None, Some(oo)) => triple.object == oo, + (None, None, None) => true, + }; + + if matches { + results.push(triple.clone()); + } + } + + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; + let pre = match 
p { Constant(x) => Some(*x), Variable(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + self.table.clear(); + } + + // no specialized scans possible; all return None + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + // nothing complex to optimize here — just shrink to fit + self.table.shrink_to_fit(); + } +} +impl SingleTableIndex { + pub fn new() -> Self { + Self { table: HashSet::new() } + } + + /// merge another NoIndex into this one + pub fn merge_from(&mut self, other: SingleTableIndex) { + self.table.extend(other.table); + } + + /// optimized single-triple insert used during parallel builds + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + // returns true if inserted, false if already present + self.table.insert(triple.clone()) + } + + fn optimize_post_build(&mut self) { + self.table.shrink_to_fit(); + } +} From 3f5745dd971a8366a1202e3c71e68d6fc9f5d4fc Mon Sep 17 00:00:00 2001 From: Mirovh Date: Wed, 4 Mar 2026 23:16:22 +0100 Subject: [PATCH 09/23] Implement dynamic hexastore --- .../n_triples_data/n_triple_10M.rs | 50 +- .../n_triple_10M_all_indexes.sh | 3 +- kolibrie/src/parser.rs | 1 - kolibrie/src/query_engine.rs | 7 +- kolibrie/src/sparql_database.rs | 6394 +++++++++-------- kolibrie/src/storage_manager.rs | 10 +- .../execution/engine.rs | 22 +- shared/src/index_manager/dynamic_hexastore.rs | 1037 +++ shared/src/index_manager/mod.rs | 43 + 9 files changed, 4370 insertions(+), 3197 deletions(-) diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs 
b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index 49695bd..d1e2f3d 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -30,41 +30,47 @@ use std::io::{BufRead, BufReader}; use std::time::Instant; use shared::index_manager::*; -fn make_index_from_env() -> (String, Box) { - let index_type = std::env::var("INDEX_TYPE") - .unwrap_or_else(|_| "hexastore".to_string()) - .to_lowercase(); +fn make_config_from_env() -> (String, IndexConfig) { + let index_type = std::env::var("INDEX_TYPE") + .unwrap_or_else(|_| "hexastore".to_string()) + .to_lowercase(); - let index: Box = match index_type.as_str() { - "hexastore" => Box::new(HexastoreIndex::new()), - "" => Box::new(HexastoreIndex::new()), - "spo" => Box::new(SPOSingleIndex::new()), - "pos" => Box::new(POSSingleIndex::new()), - "osp" => Box::new(OSPSingleIndex::new()), - "pso" => Box::new(PSOSingleIndex::new()), - "ops" => Box::new(OPSSingleIndex::new()), - "sop" => Box::new(SOPSingleIndex::new()), - "table" => Box::new(SingleTableIndex::new()), - other => { - eprintln!("WARNING: Unknown INDEX_TYPE '{}', falling back to hexastore.", other); - Box::new(HexastoreIndex::new()) - } - }; + let config = match index_type.as_str() { + "hexastore" | "" => IndexConfig::Hexastore, + "spo" => IndexConfig::SPO, + "pos" => IndexConfig::POS, + "osp" => IndexConfig::OSP, + "pso" => IndexConfig::PSO, + "ops" => IndexConfig::OPS, + "sop" => IndexConfig::SOP, + "table" => IndexConfig::SingleTable, + "dynamic" => IndexConfig::DynamicHexastore { + eval_interval: 1000, + queries: vec![], // or read from another env var + }, + other => { + eprintln!( + "WARNING: Unknown INDEX_TYPE '{}', falling back to hexastore.", + other + ); + IndexConfig::Hexastore + } + }; - (index_type, index) + (index_type, config) } fn parse_large_ntriples_file( file_path: &str, ) -> Result> { - let (index_name, index) = make_index_from_env(); + let 
(index_name, config) = make_config_from_env(); println!("INDEX_TYPE = {}", index_name); println!("Starting to parse N-Triples file: {}", file_path); let start_time = Instant::now(); //let mut db = SparqlDatabase::new(); //let mut db = SparqlDatabase::with_index(Box::new(SPOSingleIndex::new())); - let mut db = SparqlDatabase::with_index(index); + let mut db = SparqlDatabase::with_config(config); // Much smaller buffer and more aggressive memory management let file = File::open(file_path)?; diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index d038223..93e5087 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -16,7 +16,8 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" INDEX_TYPES=( - "table" + "dynamic_hexastore", + "hexastore" ) echo "==============================================" diff --git a/kolibrie/src/parser.rs b/kolibrie/src/parser.rs index 9825368..548453b 100644 --- a/kolibrie/src/parser.rs +++ b/kolibrie/src/parser.rs @@ -27,7 +27,6 @@ use shared::rule::FilterCondition; use shared::rule::Rule; use shared::terms::*; use shared::query::*; -use shared::index_manager::TripleIndex; // Add RSP imports use crate::rsp::s2r::{CSPARQLWindow, Report, ReportStrategy, Tick, WindowTriple, ContentContainer}; use crate::rsp::r2s::{Relation2StreamOperator, StreamOperator}; diff --git a/kolibrie/src/query_engine.rs b/kolibrie/src/query_engine.rs index 4fe9865..4897671 100644 --- a/kolibrie/src/query_engine.rs +++ b/kolibrie/src/query_engine.rs @@ -8,7 +8,6 @@ * you can obtain one at https://mozilla.org/MPL/2.0/. 
*/ -use shared::index_manager::TripleIndex; use crate::storage_manager::{StorageManager, StorageBackend, StorageStats}; use crate::storage_trait::{StorageTrait, StorageMode, QueryAnalyzer}; use crate::disk_storage::lsm_tree::LSMConfig; @@ -113,7 +112,7 @@ impl QueryEngine { // Extract the encoded triples let triples = self.storage_manager.get_memory_database() - .index_manager + .index_manager.as_ref().expect("Cannot query index before building it") .query(None, None, None); // Insert into LSM-Tree @@ -122,7 +121,7 @@ impl QueryEngine { // Clear memory database self.storage_manager.get_memory_database_mut().triples.clear(); self.storage_manager.get_memory_database_mut().index_manager = - Box::new(shared::index_manager::HexastoreIndex::new()); + Some(Box::new(shared::index_manager::HexastoreIndex::new())); // Build statistics self.storage_manager.get_memory_database_mut().get_or_build_stats(); @@ -357,4 +356,4 @@ mod tests { assert!(explanation.will_use_volcano); assert!(!explanation.has_windowing); } -} \ No newline at end of file +} diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index e6213f5..aa3994b 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -1,12 +1,12 @@ /* - * Copyright © 2025 Volodymyr Kadzhaia - * Copyright © 2025 Pieter Bonte - * KU Leuven — Stream Intelligence Lab, Belgium - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this file, - * you can obtain one at https://mozilla.org/MPL/2.0/. - */ +* Copyright © 2025 Volodymyr Kadzhaia +* Copyright © 2025 Pieter Bonte +* KU Leuven — Stream Intelligence Lab, Belgium +* +* This Source Code Form is subject to the terms of the Mozilla Public +* License, v. 2.0. If a copy of the MPL was not distributed with this file, +* you can obtain one at https://mozilla.org/MPL/2.0/. 
+*/ use shared::dictionary::Dictionary; use crate::sliding_window::SlidingWindow; @@ -37,6 +37,12 @@ use std::sync::Arc; use std::sync::{Mutex, RwLock}; use url::Url; use crate::streamertail_optimizer::DatabaseStats; +use shared::index_manager::{ + IndexConfig, HexastoreIndex, SPOSingleIndex, POSSingleIndex, OSPSingleIndex, + PSOSingleIndex, OPSSingleIndex, SOPSingleIndex, SingleTableIndex, DynamicHexastoreIndex +}; +use shared::terms::TriplePattern; +use crate::parser::convert_triple_pattern; const MIN_CHUNK_SIZE: usize = 1024; const HASHMAP_INITIAL_CAPACITY: usize = 4096; @@ -46,3321 +52,3403 @@ const HASHMAP_INITIAL_CAPACITY1: usize = 1024; #[derive(Debug, Clone)] pub struct SparqlDatabase { - pub triples: BTreeSet, - pub streams: Vec, - pub sliding_window: Option, - pub dictionary: Arc>, - pub prefixes: HashMap, - pub udfs: HashMap, - pub index_manager: Box, - pub rule_map: HashMap, - pub cached_stats: Option>, + pub triples: BTreeSet, + pub streams: Vec, + pub sliding_window: Option, + pub dictionary: Arc>, + pub prefixes: HashMap, + pub udfs: HashMap, + pub index_manager: Option>, + pub rule_map: HashMap, + pub cached_stats: Option>, + index_config: IndexConfig, } #[allow(dead_code)] impl SparqlDatabase { - pub fn new() -> Self { - Self::with_index(Box::new(shared::index_manager::HexastoreIndex::new())) - } - - /// Creates a new database with a user-chosen indexing strategy. 
- pub fn with_index(index: Box) -> Self { - Self { - triples: BTreeSet::new(), - streams: Vec::new(), - sliding_window: None, - dictionary: Arc::new(RwLock::new(Dictionary::new())), - prefixes: HashMap::new(), - udfs: HashMap::new(), - index_manager: index, - rule_map: HashMap::new(), - cached_stats: None, - } - } - - pub fn set_prefixes(&mut self, prefixes: HashMap){ - self.prefixes=prefixes; - } - - pub fn get_or_build_stats(&mut self) -> Arc { - if let Some(stats) = &self.cached_stats { - return stats.clone(); // ← Clone the Arc (cheap), not the DatabaseStats - } - - let stats = Arc::new(DatabaseStats::gather_stats_fast(self)); - self.cached_stats = Some(stats.clone()); - stats - } - - pub fn invalidate_stats_cache(&mut self) { - self.cached_stats = None; - } - - pub fn query(&self) -> QueryBuilder<'_> { - QueryBuilder::new(self) - } - - pub fn add_triple(&mut self, triple: Triple) { - self.triples.insert(triple.clone()); - self.index_manager.insert(&triple); - } - - pub fn delete_triple(&mut self, triple: &Triple) -> bool { - let removed = self.triples.remove(triple); - if removed { - self.index_manager.delete(triple); + pub fn new() -> Self { + Self::with_config(IndexConfig::Hexastore) + } + + /// Creates a new database with a user-chosen indexing strategy. 
+ pub fn with_config(config: IndexConfig) -> Self { + Self { + triples: BTreeSet::new(), + streams: Vec::new(), + sliding_window: None, + dictionary: Arc::new(RwLock::new(Dictionary::new())), + prefixes: HashMap::new(), + udfs: HashMap::new(), + index_manager: None, + rule_map: HashMap::new(), + cached_stats: None, + index_config: config, + } + } + pub fn set_prefixes(&mut self, prefixes: HashMap){ + self.prefixes=prefixes; + } + + fn make_initial_index(config: &IndexConfig) -> Box { + match config { + IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), + IndexConfig::SPO => Box::new(SPOSingleIndex::new()), + IndexConfig::POS => Box::new(POSSingleIndex::new()), + IndexConfig::OSP => Box::new(OSPSingleIndex::new()), + IndexConfig::PSO => Box::new(PSOSingleIndex::new()), + IndexConfig::OPS => Box::new(OPSSingleIndex::new()), + IndexConfig::SOP => Box::new(SOPSingleIndex::new()), + IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), + // Pattern-dependent indexes start as hexastore; + // `build_all_indexes` will swap them out. + IndexConfig::DynamicHexastore { .. } => Box::new(HexastoreIndex::new()), + } + } + + fn resolve_query_patterns(&self, raw_queries: &[String]) -> Vec { + let mut patterns = Vec::new(); + + for query_str in raw_queries { + // parse_sparql_query returns a big tuple; field index 2 + // is the Vec of raw (&str, &str, &str) triple patterns, + // field index 5 is the HashMap of prefixes. 
+ if let Ok((_rest, parsed)) = crate::parser::parse_sparql_query(query_str) { + let raw_patterns = parsed.2; // Vec<(&str, &str, &str)> + let query_prefixes = parsed.5; // HashMap + + // Merge query prefixes with database prefixes + let mut all_prefixes = self.prefixes.clone(); + for (k, v) in query_prefixes { + all_prefixes.insert(k, v); } - removed - } - /// Helper function that accepts parts of a triple, constructs a Triple, and adds it - pub fn add_triple_parts(&mut self, subject: &str, predicate: &str, object: &str) { let mut dict = self.dictionary.write().unwrap(); - let subject_id = dict.encode(subject); - let predicate_id = dict.encode(predicate); - let object_id = dict.encode(object); - drop(dict); + for triple in raw_patterns { + patterns.push(convert_triple_pattern(triple, &mut dict, &all_prefixes)); + } + } + } + + patterns + } + + pub fn build_all_indexes(&mut self) { + let triples: Vec = self.triples.iter().cloned().collect(); + + let mut index: Box = match &self.index_config { + IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), + IndexConfig::SPO => Box::new(SPOSingleIndex::new()), + IndexConfig::POS => Box::new(POSSingleIndex::new()), + IndexConfig::OSP => Box::new(OSPSingleIndex::new()), + IndexConfig::PSO => Box::new(PSOSingleIndex::new()), + IndexConfig::OPS => Box::new(OPSSingleIndex::new()), + IndexConfig::SOP => Box::new(SOPSingleIndex::new()), + IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), + + IndexConfig::DynamicHexastore { eval_interval, queries } => { + let patterns = self.resolve_query_patterns(queries); + let eval = *eval_interval as usize; + Box::new(DynamicHexastoreIndex::new(patterns, eval)) + } + + // Future index types go here: + // IndexConfig::YourNewIndex { some_param, queries } => { + // let patterns = self.resolve_query_patterns(queries); + // Box::new(YourNewIndex::new(patterns, *some_param)) + // } + }; + + index.build_from_triples(&triples); + index.optimize(); + self.index_manager = Some(index); + 
} + + /// Get a reference to the index. + /// Panics if `build_all_indexes()` hasn't been called yet. + pub fn index(&self) -> &dyn TripleIndex { + self.index_manager + .as_deref() + .expect("index not built — call build_all_indexes() first") + } + + /// Get a mutable reference to the index. + /// Panics if `build_all_indexes()` hasn't been called yet. + pub fn index_mut(&mut self) -> &mut dyn TripleIndex { + self.index_manager + .as_deref_mut() + .expect("index not built — call build_all_indexes() first") + } + + pub fn get_or_build_stats(&mut self) -> Arc { + if let Some(stats) = &self.cached_stats { + return stats.clone(); // ← Clone the Arc (cheap), not the DatabaseStats + } + + let stats = Arc::new(DatabaseStats::gather_stats_fast(self)); + self.cached_stats = Some(stats.clone()); + stats + } + + pub fn invalidate_stats_cache(&mut self) { + self.cached_stats = None; + } + + pub fn query(&self) -> QueryBuilder<'_> { + QueryBuilder::new(self) + } + + pub fn add_triple(&mut self, triple: Triple) { + self.triples.insert(triple.clone()); + if let Some(ref mut idx) = self.index_manager { + idx.insert(&triple); + } + } + + pub fn delete_triple(&mut self, triple: &Triple) -> bool { + let removed = self.triples.remove(triple); + if removed { + if let Some(ref mut idx) = self.index_manager { + idx.delete(triple); + } + } + removed + } + + /// Helper function that accepts parts of a triple, constructs a Triple, and adds it + pub fn add_triple_parts(&mut self, subject: &str, predicate: &str, object: &str) { + let mut dict = self.dictionary.write().unwrap(); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); + drop(dict); + + let triple = Triple { + subject: subject_id, + predicate: predicate_id, + object: object_id, + }; + self.add_triple(triple); + } + + /// Helper function that accepts parts of a triple, constructs a Triple, and deletes it + pub fn delete_triple_parts(&mut self, subject: &str, 
predicate: &str, object: &str) -> bool { + let mut dict = self.dictionary.write().unwrap(); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); + drop(dict); + + let triple = Triple { + subject: subject_id, + predicate: predicate_id, + object: object_id, + }; + self.delete_triple(&triple) + } + + pub fn generate_rdf_xml(&mut self) -> String { + let mut xml = String::new(); + xml.push_str("\n"); + xml.push_str("\n"); + + // Group triples by subject + let dict = self.dictionary.read().unwrap(); + let mut subjects: BTreeMap> = BTreeMap::new(); + for triple in &self.triples { + let subject = dict.decode(triple.subject); + let predicate = dict.decode(triple.predicate); + let object = dict.decode(triple.object); + subjects.entry(subject.unwrap().to_string()).or_default().push((predicate.unwrap().to_string(), object.unwrap().to_string())); + } + drop(dict); + + // For each subject, create an element. + for (subject, po_pairs) in subjects { + xml.push_str(&format!(" \n", subject)); + for (predicate, object) in po_pairs { + xml.push_str(&format!(" <{}>{}\n", predicate, object, predicate)); + } + xml.push_str(" \n"); + } + + xml.push_str("\n"); + xml + } + + pub fn parse_rdf(&mut self, rdf_xml: &str) { + let mut reader = Reader::from_str(rdf_xml); + + let mut current_subject = Vec::with_capacity(128); + let mut current_predicate = Vec::with_capacity(128); + + let (sender, receiver) = unbounded::>(); + let dictionary = Arc::clone(&self.dictionary); + let triples_set = Arc::new(Mutex::new(Vec::new())); + let num_threads = utils::get_num_cpus(); + + // Crossbeam scope to manage threads + scope(|s| { + // Spawn worker threads + for _ in 0..num_threads { + let receiver = receiver.clone(); + let triples_set = Arc::clone(&triples_set); + s.spawn(move |_| { + while let Ok(chunk) = receiver.recv() { + if chunk.is_empty() { + // Termination signal + break; + } + + // Process chunk using Rayon + let local_triples: 
BTreeSet = + chunk.into_par_iter().map(|triple| triple).collect(); + + // Insert into shared triples set + let mut triples = triples_set.lock().unwrap(); + triples.push(local_triples); + } + }); + } + + // Parsing and sending chunks + let mut triples = Vec::with_capacity(8192); + loop { + match reader.read_event() { + Ok(Event::Start(ref e)) => match e.name() { + QName(b"rdf:RDF") => { + for attr in e.attributes().filter_map(Result::ok) { + let key = attr.key; + let value = attr.value; + if key.as_ref().starts_with(b"xmlns:") { + let prefix = std::str::from_utf8(&key.as_ref()[6..]) + .unwrap_or("") + .to_string(); + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert(prefix, uri); + } else if key.as_ref() == b"xmlns" { + // Default namespace + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert("".to_string(), uri); + } + } + } + QName(b"rdf:Description") => { + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:about") { + current_subject.truncate(0); + current_subject.extend_from_slice(&attr.value); + } + } + } + QName(b"rdfs:Class") | QName(b"rdf:type") => { + current_predicate.truncate(0); + current_predicate.extend_from_slice(b"rdf:type"); + } + QName(b"rdfs:subClassOf") => { + current_predicate.truncate(0); + current_predicate.extend_from_slice(b"rdfs:subClassOf"); + } + QName(b"rdfs:label") => { + current_predicate.truncate(0); + current_predicate.extend_from_slice(b"rdfs:label"); + } + name => { + let name_str = + std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); + let resolved_predicate = self.resolve_term(&name_str); + current_predicate = resolved_predicate.clone().into_bytes(); + } + }, + Ok(Event::Empty(ref e)) => { + if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { + let resolved_predicate = self.resolve_term(predicate); + let mut object = Vec::with_capacity(128); + for attr in e.attributes().filter_map(Result::ok) { + if 
attr.key == QName(b"rdf:resource") { + object.extend_from_slice(&attr.value); + } + } + if !object.is_empty() { + if let (Ok(subject_str), Ok(object_str)) = ( + std::str::from_utf8(¤t_subject), + std::str::from_utf8(&object), + ) { + // Lock the dictionary for encoding + let mut dict = dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(object_str), + }; + drop(dict); // Release the lock + triples.push(triple); + } + } + } + } + Ok(Event::Text(e)) => { + // Use Reader's decode method and trim whitespace + if let Ok(object_str) = reader.decoder().decode(e.as_ref()) { + let trimmed_object = object_str.trim(); + // Skip empty or whitespace-only text + if !trimmed_object.is_empty() { + if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { + if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { + let resolved_predicate = self.resolve_term(predicate_str); + // Lock the dictionary for encoding + let mut dict = dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(trimmed_object), + }; + drop(dict); // Release the lock + triples.push(triple); + } + } + } + } + } + Ok(Event::End(ref e)) => { + if e.name() == QName(b"rdf:Description") { + current_subject.truncate(0); + current_predicate.truncate(0); + } + } + Ok(Event::Eof) => break, + Err(e) => { + eprintln!("Error reading XML: {:?}", e); + break; + } + _ => {} + } + + if triples.len() >= 8192 { + sender.send(triples).unwrap(); + triples = Vec::with_capacity(8192); + } + } + + if !triples.is_empty() { + sender.send(triples).unwrap(); + } + + // Send termination signals + for _ in 0..num_threads { + sender.send(Vec::new()).unwrap(); + } + }) + .unwrap(); + + // Merge all BTreeSets into the main triples set + let triples_sets = Arc::try_unwrap(triples_set).unwrap().into_inner().unwrap(); + for 
local_triples in triples_sets { + self.triples.extend(local_triples); + } + } + + pub fn parse_rdf_from_file(&mut self, filename: &str) { + let file = std::fs::File::open(filename).expect("Cannot open file"); + let reader = std::io::BufReader::new(file); + let mut xml_reader = Reader::from_reader(reader); + + let mut current_subject = Vec::with_capacity(128); + let mut current_predicate = Vec::with_capacity(128); + + // First, read prefixes before spawning worker threads + let mut buf = Vec::new(); + loop { + match xml_reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + if e.name() == QName(b"rdf:RDF") { + // Read prefixes + for attr in e.attributes().filter_map(Result::ok) { + let key = attr.key; + let value = attr.value; + if key.as_ref().starts_with(b"xmlns:") { + let prefix = std::str::from_utf8(&key.as_ref()[6..]) + .unwrap_or("") + .to_string(); + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert(prefix, uri); + } else if key.as_ref() == b"xmlns" { + // Default namespace + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert("".to_string(), uri); + } + } + break; // We have read the prefixes, proceed to the rest + } + } + Ok(Event::Eof) => { + eprintln!("Reached EOF before reading prefixes."); + break; + } + Err(e) => { + eprintln!("Error reading XML: {:?}", e); + break; + } + _ => {} + } + buf.clear(); + } + + // Continue reading and parsing the rest of the file + let mut triples = Vec::with_capacity(8192); + loop { + match xml_reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => match e.name() { + QName(b"rdf:Description") => { + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:about") { + current_subject.clear(); + current_subject.extend_from_slice(&attr.value); + } + } + } + QName(b"rdfs:Class") | QName(b"rdf:type") => { + current_predicate.clear(); + current_predicate.extend_from_slice(b"rdf:type"); + } + 
QName(b"rdfs:subClassOf") => { + current_predicate.clear(); + current_predicate.extend_from_slice(b"rdfs:subClassOf"); + } + QName(b"rdfs:label") => { + current_predicate.clear(); + current_predicate.extend_from_slice(b"rdfs:label"); + } + name => { + let name_str = std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); + let resolved_predicate = self.resolve_term(&name_str); + current_predicate = resolved_predicate.clone().into_bytes(); + } + }, + Ok(Event::Empty(ref e)) => { + if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { + let resolved_predicate = self.resolve_term(predicate); + let mut object = Vec::with_capacity(128); + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:resource") { + object.extend_from_slice(&attr.value); + } + } + if !object.is_empty() { + if let (Ok(subject_str), Ok(object_str)) = ( + std::str::from_utf8(¤t_subject), + std::str::from_utf8(&object), + ) { + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(object_str), + }; + drop(dict); + triples.push(triple); + } + } + } + } + Ok(Event::Text(e)) => { + // Use Reader's decode method and trim whitespace + if let Ok(object_str) = xml_reader.decoder().decode(e.as_ref()) { + let trimmed_object = object_str.trim(); + // Skip empty or whitespace-only text + if !trimmed_object.is_empty() { + if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { + if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { + let resolved_predicate = self.resolve_term(predicate_str); + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(trimmed_object), + }; + drop(dict); + triples.push(triple); + } + } + } + } + } + Ok(Event::End(ref e)) => { + if e.name() == QName(b"rdf:Description") { + 
current_subject.clear(); + current_predicate.clear(); + } + } + Ok(Event::Eof) => break, + Err(e) => { + eprintln!("Error reading XML: {:?}", e); + break; + } + _ => {} + } + + buf.clear(); + + if triples.len() >= 8192 { + // Process triples in parallel using Rayon + let local_triples: BTreeSet = triples.into_par_iter().collect(); + self.triples.extend(local_triples); + triples = Vec::with_capacity(8192); + } + } + + if !triples.is_empty() { + let local_triples: BTreeSet = triples.into_par_iter().collect(); + self.triples.extend(local_triples); + } + } + + // New parse_turtle function + pub fn parse_turtle(&mut self, turtle_data: &str) { + let lines = turtle_data.lines(); + + for line in lines { + let line = line.trim(); + + // Skip empty lines and comments + if line.is_empty() || line.starts_with("#") { + continue; + } + + // Parse triples + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 3 { + let subject_raw = parts[0].trim_end_matches('.'); + let predicate_raw = parts[1].trim_end_matches('.'); + let object_raw = parts[2..].join(" ").trim_end_matches('.').to_string(); + + // Strip angle brackets from IRIs + let subject = if subject_raw.starts_with('<') && subject_raw.ends_with('>') { + subject_raw[1..subject_raw.len()-1].to_string() + } else { + subject_raw.to_string() + }; - let triple = Triple { - subject: subject_id, - predicate: predicate_id, - object: object_id, + let predicate = if predicate_raw.starts_with('<') && predicate_raw.ends_with('>') { + predicate_raw[1..predicate_raw.len()-1].to_string() + } else { + predicate_raw.to_string() }; - self.add_triple(triple); - } - /// Helper function that accepts parts of a triple, constructs a Triple, and deletes it - pub fn delete_triple_parts(&mut self, subject: &str, predicate: &str, object: &str) -> bool { - let mut dict = self.dictionary.write().unwrap(); - let subject_id = dict.encode(subject); - let predicate_id = dict.encode(predicate); - let object_id = dict.encode(object); - 
drop(dict); + // Clean up object by removing quotes and angle brackets + let object = if object_raw.starts_with('<') && object_raw.ends_with('>') { + object_raw[1..object_raw.len()-1].to_string() + } else if object_raw.starts_with('"') && object_raw.ends_with('"') { + object_raw[1..object_raw.len()-1].to_string() + } else { + object_raw.trim().trim_matches('"').to_string() + }; + let mut dict = self.dictionary.write().unwrap(); let triple = Triple { - subject: subject_id, - predicate: predicate_id, - object: object_id, + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), }; - self.delete_triple(&triple) - } - - pub fn generate_rdf_xml(&mut self) -> String { - let mut xml = String::new(); - xml.push_str("\n"); - xml.push_str("\n"); - - // Group triples by subject - let dict = self.dictionary.read().unwrap(); - let mut subjects: BTreeMap> = BTreeMap::new(); - for triple in &self.triples { - let subject = dict.decode(triple.subject); - let predicate = dict.decode(triple.predicate); - let object = dict.decode(triple.object); - subjects.entry(subject.unwrap().to_string()).or_default().push((predicate.unwrap().to_string(), object.unwrap().to_string())); - } drop(dict); - - // For each subject, create an element. 
- for (subject, po_pairs) in subjects { - xml.push_str(&format!(" \n", subject)); - for (predicate, object) in po_pairs { - xml.push_str(&format!(" <{}>{}\n", predicate, object, predicate)); - } - xml.push_str(" \n"); - } - - xml.push_str("\n"); - xml - } - - pub fn parse_rdf(&mut self, rdf_xml: &str) { - let mut reader = Reader::from_str(rdf_xml); - - let mut current_subject = Vec::with_capacity(128); - let mut current_predicate = Vec::with_capacity(128); - - let (sender, receiver) = unbounded::>(); - let dictionary = Arc::clone(&self.dictionary); - let triples_set = Arc::new(Mutex::new(Vec::new())); - let num_threads = utils::get_num_cpus(); - - // Crossbeam scope to manage threads - scope(|s| { - // Spawn worker threads - for _ in 0..num_threads { - let receiver = receiver.clone(); - let triples_set = Arc::clone(&triples_set); - s.spawn(move |_| { - while let Ok(chunk) = receiver.recv() { - if chunk.is_empty() { - // Termination signal - break; - } - - // Process chunk using Rayon - let local_triples: BTreeSet = - chunk.into_par_iter().map(|triple| triple).collect(); - - // Insert into shared triples set - let mut triples = triples_set.lock().unwrap(); - triples.push(local_triples); - } - }); - } - - // Parsing and sending chunks - let mut triples = Vec::with_capacity(8192); - loop { - match reader.read_event() { - Ok(Event::Start(ref e)) => match e.name() { - QName(b"rdf:RDF") => { - for attr in e.attributes().filter_map(Result::ok) { - let key = attr.key; - let value = attr.value; - if key.as_ref().starts_with(b"xmlns:") { - let prefix = std::str::from_utf8(&key.as_ref()[6..]) - .unwrap_or("") - .to_string(); - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert(prefix, uri); - } else if key.as_ref() == b"xmlns" { - // Default namespace - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert("".to_string(), uri); - } - } - } - QName(b"rdf:Description") => { - for attr in 
e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:about") { - current_subject.truncate(0); - current_subject.extend_from_slice(&attr.value); - } - } - } - QName(b"rdfs:Class") | QName(b"rdf:type") => { - current_predicate.truncate(0); - current_predicate.extend_from_slice(b"rdf:type"); - } - QName(b"rdfs:subClassOf") => { - current_predicate.truncate(0); - current_predicate.extend_from_slice(b"rdfs:subClassOf"); - } - QName(b"rdfs:label") => { - current_predicate.truncate(0); - current_predicate.extend_from_slice(b"rdfs:label"); - } - name => { - let name_str = - std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); - let resolved_predicate = self.resolve_term(&name_str); - current_predicate = resolved_predicate.clone().into_bytes(); - } - }, - Ok(Event::Empty(ref e)) => { - if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { - let resolved_predicate = self.resolve_term(predicate); - let mut object = Vec::with_capacity(128); - for attr in e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:resource") { - object.extend_from_slice(&attr.value); - } - } - if !object.is_empty() { - if let (Ok(subject_str), Ok(object_str)) = ( - std::str::from_utf8(¤t_subject), - std::str::from_utf8(&object), - ) { - // Lock the dictionary for encoding - let mut dict = dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(object_str), - }; - drop(dict); // Release the lock - triples.push(triple); - } - } - } - } - Ok(Event::Text(e)) => { - // Use Reader's decode method and trim whitespace - if let Ok(object_str) = reader.decoder().decode(e.as_ref()) { - let trimmed_object = object_str.trim(); - // Skip empty or whitespace-only text - if !trimmed_object.is_empty() { - if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { - if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { - let resolved_predicate = 
self.resolve_term(predicate_str); - // Lock the dictionary for encoding - let mut dict = dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(trimmed_object), - }; - drop(dict); // Release the lock - triples.push(triple); - } - } - } - } - } - Ok(Event::End(ref e)) => { - if e.name() == QName(b"rdf:Description") { - current_subject.truncate(0); - current_predicate.truncate(0); - } - } - Ok(Event::Eof) => break, - Err(e) => { - eprintln!("Error reading XML: {:?}", e); - break; - } - _ => {} - } - - if triples.len() >= 8192 { - sender.send(triples).unwrap(); - triples = Vec::with_capacity(8192); - } - } - - if !triples.is_empty() { - sender.send(triples).unwrap(); - } - - // Send termination signals - for _ in 0..num_threads { - sender.send(Vec::new()).unwrap(); - } - }) - .unwrap(); - - // Merge all BTreeSets into the main triples set - let triples_sets = Arc::try_unwrap(triples_set).unwrap().into_inner().unwrap(); - for local_triples in triples_sets { - self.triples.extend(local_triples); - } - } - - pub fn parse_rdf_from_file(&mut self, filename: &str) { - let file = std::fs::File::open(filename).expect("Cannot open file"); - let reader = std::io::BufReader::new(file); - let mut xml_reader = Reader::from_reader(reader); - - let mut current_subject = Vec::with_capacity(128); - let mut current_predicate = Vec::with_capacity(128); - - // First, read prefixes before spawning worker threads - let mut buf = Vec::new(); - loop { - match xml_reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => { - if e.name() == QName(b"rdf:RDF") { - // Read prefixes - for attr in e.attributes().filter_map(Result::ok) { - let key = attr.key; - let value = attr.value; - if key.as_ref().starts_with(b"xmlns:") { - let prefix = std::str::from_utf8(&key.as_ref()[6..]) - .unwrap_or("") - .to_string(); - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - 
self.prefixes.insert(prefix, uri); - } else if key.as_ref() == b"xmlns" { - // Default namespace - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert("".to_string(), uri); - } - } - break; // We have read the prefixes, proceed to the rest - } - } - Ok(Event::Eof) => { - eprintln!("Reached EOF before reading prefixes."); - break; - } - Err(e) => { - eprintln!("Error reading XML: {:?}", e); - break; - } - _ => {} - } - buf.clear(); - } - - // Continue reading and parsing the rest of the file - let mut triples = Vec::with_capacity(8192); - loop { - match xml_reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => match e.name() { - QName(b"rdf:Description") => { - for attr in e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:about") { - current_subject.clear(); - current_subject.extend_from_slice(&attr.value); - } - } - } - QName(b"rdfs:Class") | QName(b"rdf:type") => { - current_predicate.clear(); - current_predicate.extend_from_slice(b"rdf:type"); - } - QName(b"rdfs:subClassOf") => { - current_predicate.clear(); - current_predicate.extend_from_slice(b"rdfs:subClassOf"); - } - QName(b"rdfs:label") => { - current_predicate.clear(); - current_predicate.extend_from_slice(b"rdfs:label"); - } - name => { - let name_str = std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); - let resolved_predicate = self.resolve_term(&name_str); - current_predicate = resolved_predicate.clone().into_bytes(); - } - }, - Ok(Event::Empty(ref e)) => { - if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { - let resolved_predicate = self.resolve_term(predicate); - let mut object = Vec::with_capacity(128); - for attr in e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:resource") { - object.extend_from_slice(&attr.value); - } - } - if !object.is_empty() { - if let (Ok(subject_str), Ok(object_str)) = ( - std::str::from_utf8(¤t_subject), - std::str::from_utf8(&object), - ) { - let mut dict = 
self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(object_str), - }; - drop(dict); - triples.push(triple); - } - } - } - } - Ok(Event::Text(e)) => { - // Use Reader's decode method and trim whitespace - if let Ok(object_str) = xml_reader.decoder().decode(e.as_ref()) { - let trimmed_object = object_str.trim(); - // Skip empty or whitespace-only text - if !trimmed_object.is_empty() { - if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { - if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { - let resolved_predicate = self.resolve_term(predicate_str); - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(trimmed_object), - }; - drop(dict); - triples.push(triple); - } - } - } - } - } - Ok(Event::End(ref e)) => { - if e.name() == QName(b"rdf:Description") { - current_subject.clear(); - current_predicate.clear(); - } - } - Ok(Event::Eof) => break, - Err(e) => { - eprintln!("Error reading XML: {:?}", e); - break; - } - _ => {} - } + self.triples.insert(triple); + } else { + eprintln!("Skipping invalid line: {}", line); + } + } + } + + // New parse_n3 function + pub fn parse_n3(&mut self, n3_data: &str) { + let lines: Vec = n3_data.lines().map(|l| l.trim().to_string()).collect(); + let chunk_size = 1000; + let chunks: Vec> = lines + .chunks(chunk_size) + .map(|c| c.to_vec()) + .collect(); + + let partial_results: Vec<(BTreeSet, Arc>, HashMap)> = + chunks.par_iter().map(|chunk| { + let mut local_db = SparqlDatabase::new(); + let mut statement = String::new(); + + for raw_line in chunk { + let mut line = raw_line.as_str(); + if let Some(comment_start) = line.find('#') { + line = &line[..comment_start]; + line = line.trim(); + } + if line.is_empty() { + continue; + } + if line.starts_with("@prefix") { + let line = 
line.trim_start_matches("@prefix").trim_end_matches('.'); + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + let prefix = parts[0].trim_end_matches(':').to_string(); + let uri = parts[1].trim_start_matches('<').trim_end_matches('>').to_string(); + local_db.prefixes.insert(prefix, uri); + } else { + eprintln!("Invalid prefix declaration: {}", line); + } + } else { + statement.push_str(line); + statement.push(' '); + if line.ends_with('.') { + local_db.parse_statement(statement.trim()); + statement.clear(); + } + } + } + + (local_db.triples, local_db.dictionary, local_db.prefixes) + }).collect(); + + for (triples, dict_arc, pref) in partial_results { + for t in triples { + self.triples.insert(t); + } + let mut self_dict = self.dictionary.write().unwrap(); + let other_dict = dict_arc.read().unwrap(); + self_dict.merge(&other_dict); + drop(other_dict); + drop(self_dict); + for (k, v) in pref { + self.prefixes.insert(k, v); + } + } + } + + // Parse_ntriples and add to DB function + pub fn parse_ntriples_and_add(&mut self, ntriples_data: &str) { + let partial_results = self.parse_ntriples(ntriples_data); + + let encoded_triples = self.encode_triples(partial_results); + for encoded_triple in encoded_triples{ + self.add_triple(encoded_triple); + } + } + + // Parses ntriples + pub fn parse_ntriples(&mut self, ntriples_data: &str) -> Vec> { + let lines: Vec<&str> = ntriples_data.lines().collect(); + let chunk_size = 1000; + let chunks: Vec<&[&str]> = lines.chunks(chunk_size).collect(); + + let partial_results: Vec> = chunks + .par_iter() + .map(|chunk| { + let mut local_triples = Vec::new(); + + for line in chunk.iter() { + let line = line.trim(); + + // Skip empty lines and comments + if line.is_empty() || line.starts_with('#') { + continue; + } - buf.clear(); + // N-Triples must end with a dot + if !line.ends_with('.') { + eprintln!("Invalid N-Triples line (missing dot): {}", line); + continue; + } - if triples.len() >= 8192 { - // 
Process triples in parallel using Rayon - let local_triples: BTreeSet = triples.into_par_iter().collect(); - self.triples.extend(local_triples); - triples = Vec::with_capacity(8192); - } - } + // Remove the trailing dot + let line_without_dot = &line[..line.len() - 1].trim(); - if !triples.is_empty() { - let local_triples: BTreeSet = triples.into_par_iter().collect(); - self.triples.extend(local_triples); + // Parse the triple + if let Some((subject, predicate, object)) = self.parse_ntriples_line(line_without_dot) { + local_triples.push((subject, predicate, object)); + } } - } - - // New parse_turtle function - pub fn parse_turtle(&mut self, turtle_data: &str) { - let lines = turtle_data.lines(); - - for line in lines { - let line = line.trim(); - - // Skip empty lines and comments - if line.is_empty() || line.starts_with("#") { - continue; - } - - // Parse triples - let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 3 { - let subject_raw = parts[0].trim_end_matches('.'); - let predicate_raw = parts[1].trim_end_matches('.'); - let object_raw = parts[2..].join(" ").trim_end_matches('.').to_string(); - // Strip angle brackets from IRIs - let subject = if subject_raw.starts_with('<') && subject_raw.ends_with('>') { - subject_raw[1..subject_raw.len()-1].to_string() - } else { - subject_raw.to_string() - }; - - let predicate = if predicate_raw.starts_with('<') && predicate_raw.ends_with('>') { - predicate_raw[1..predicate_raw.len()-1].to_string() - } else { - predicate_raw.to_string() - }; - - // Clean up object by removing quotes and angle brackets - let object = if object_raw.starts_with('<') && object_raw.ends_with('>') { - object_raw[1..object_raw.len()-1].to_string() - } else if object_raw.starts_with('"') && object_raw.ends_with('"') { - object_raw[1..object_raw.len()-1].to_string() - } else { - object_raw.trim().trim_matches('"').to_string() - }; - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: 
dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - self.triples.insert(triple); - } else { - eprintln!("Skipping invalid line: {}", line); - } - } - } + local_triples + }) + .collect(); + partial_results + } - // New parse_n3 function - pub fn parse_n3(&mut self, n3_data: &str) { - let lines: Vec = n3_data.lines().map(|l| l.trim().to_string()).collect(); - let chunk_size = 1000; - let chunks: Vec> = lines - .chunks(chunk_size) - .map(|c| c.to_vec()) - .collect(); - - let partial_results: Vec<(BTreeSet, Arc>, HashMap)> = - chunks.par_iter().map(|chunk| { - let mut local_db = SparqlDatabase::new(); - let mut statement = String::new(); - - for raw_line in chunk { - let mut line = raw_line.as_str(); - if let Some(comment_start) = line.find('#') { - line = &line[..comment_start]; - line = line.trim(); - } - if line.is_empty() { - continue; - } - if line.starts_with("@prefix") { - let line = line.trim_start_matches("@prefix").trim_end_matches('.'); - let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 2 { - let prefix = parts[0].trim_end_matches(':').to_string(); - let uri = parts[1].trim_start_matches('<').trim_end_matches('>').to_string(); - local_db.prefixes.insert(prefix, uri); + // Encode triples + pub fn encode_triples(&mut self, non_encoded_triples: Vec>) -> Vec{ + // Merge results with main dictionary + let mut encoded_triples = Vec::new(); + for triple_strings in non_encoded_triples { + for (subject, predicate, object) in triple_strings { + let mut dict = self.dictionary.write().unwrap(); + let main_triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + encoded_triples.push(main_triple); + } + } + encoded_triples + } + + pub fn parse_and_encode_ntriples(&mut self, ntriples_data: &str) -> Vec{ + let partial_results = self.parse_ntriples(ntriples_data); + + 
self.encode_triples(partial_results) + } + + // Helper method to parse a single N-Triples line + fn parse_ntriples_line(&self, line: &str) -> Option<(String, String, String)> { + let mut parts = Vec::new(); + let mut current_part = String::new(); + let mut in_uri = false; + let mut in_literal = false; + let mut escaped = false; + let mut chars = line.chars().peekable(); + + while let Some(ch) = chars.next() { + match ch { + '<' if !in_literal && !escaped => { + in_uri = true; + current_part.push(ch); + } + '>' if in_uri && !escaped => { + in_uri = false; + current_part.push(ch); + parts.push(current_part.trim().to_string()); + current_part.clear(); + } + '"' if !in_uri && !escaped => { + in_literal = !in_literal; + current_part.push(ch); + if !in_literal { + // Check for datatype or language tag after closing quote + while let Some(&next_ch) = chars.peek() { + if next_ch == '^' || next_ch == '@' { + current_part.push(chars.next().unwrap()); + // Handle ^^ for datatypes + if next_ch == '^' { + if let Some(&second_caret) = chars.peek() { + if second_caret == '^' { + current_part.push(chars.next().unwrap()); + // Now consume the datatype URI + while let Some(&datatype_ch) = chars.peek() { + if datatype_ch == '<' { + // Start of datatype URI + current_part.push(chars.next().unwrap()); + let mut in_datatype_uri = true; + while let Some(&uri_ch) = chars.peek() { + current_part.push(chars.next().unwrap()); + if uri_ch == '>' { + in_datatype_uri = false; + break; + } + } + if !in_datatype_uri { + break; + } + } else if datatype_ch.is_whitespace() { + break; } else { - eprintln!("Invalid prefix declaration: {}", line); + current_part.push(chars.next().unwrap()); } + } + } + } + } else if next_ch == '@' { + // Language tag + while let Some(&lang_ch) = chars.peek() { + if lang_ch.is_alphanumeric() || lang_ch == '-' { + current_part.push(chars.next().unwrap()); } else { - statement.push_str(line); - statement.push(' '); - if line.ends_with('.') { - 
local_db.parse_statement(statement.trim()); - statement.clear(); - } + break; } + } } - - (local_db.triples, local_db.dictionary, local_db.prefixes) - }).collect(); - - for (triples, dict_arc, pref) in partial_results { - for t in triples { - self.triples.insert(t); - } - let mut self_dict = self.dictionary.write().unwrap(); - let other_dict = dict_arc.read().unwrap(); - self_dict.merge(&other_dict); - drop(other_dict); - drop(self_dict); - for (k, v) in pref { - self.prefixes.insert(k, v); - } - } - } - - // Parse_ntriples and add to DB function - pub fn parse_ntriples_and_add(&mut self, ntriples_data: &str) { - let partial_results = self.parse_ntriples(ntriples_data); - - let encoded_triples = self.encode_triples(partial_results); - for encoded_triple in encoded_triples{ - self.add_triple(encoded_triple); - } - } - - // Parses ntriples - pub fn parse_ntriples(&mut self, ntriples_data: &str) -> Vec> { - let lines: Vec<&str> = ntriples_data.lines().collect(); - let chunk_size = 1000; - let chunks: Vec<&[&str]> = lines.chunks(chunk_size).collect(); - - let partial_results: Vec> = chunks - .par_iter() - .map(|chunk| { - let mut local_triples = Vec::new(); - - for line in chunk.iter() { - let line = line.trim(); - - // Skip empty lines and comments - if line.is_empty() || line.starts_with('#') { - continue; - } - - // N-Triples must end with a dot - if !line.ends_with('.') { - eprintln!("Invalid N-Triples line (missing dot): {}", line); - continue; - } - - // Remove the trailing dot - let line_without_dot = &line[..line.len() - 1].trim(); - - // Parse the triple - if let Some((subject, predicate, object)) = self.parse_ntriples_line(line_without_dot) { - local_triples.push((subject, predicate, object)); - } - } - - local_triples - }) - .collect(); - partial_results - } - - // Encode triples - pub fn encode_triples(&mut self, non_encoded_triples: Vec>) -> Vec{ - // Merge results with main dictionary - let mut encoded_triples = Vec::new(); - for triple_strings in 
non_encoded_triples { - for (subject, predicate, object) in triple_strings { - let mut dict = self.dictionary.write().unwrap(); - let main_triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - encoded_triples.push(main_triple); + break; + } else if next_ch.is_whitespace() { + break; + } else { + // Unexpected character after literal + break; + } } + parts.push(current_part.trim().to_string()); + current_part.clear(); + } } - encoded_triples - } - - pub fn parse_and_encode_ntriples(&mut self, ntriples_data: &str) -> Vec{ - let partial_results = self.parse_ntriples(ntriples_data); - - self.encode_triples(partial_results) - } - - // Helper method to parse a single N-Triples line - fn parse_ntriples_line(&self, line: &str) -> Option<(String, String, String)> { - let mut parts = Vec::new(); - let mut current_part = String::new(); - let mut in_uri = false; - let mut in_literal = false; - let mut escaped = false; - let mut chars = line.chars().peekable(); - - while let Some(ch) = chars.next() { - match ch { - '<' if !in_literal && !escaped => { - in_uri = true; - current_part.push(ch); - } - '>' if in_uri && !escaped => { - in_uri = false; - current_part.push(ch); - parts.push(current_part.trim().to_string()); - current_part.clear(); - } - '"' if !in_uri && !escaped => { - in_literal = !in_literal; - current_part.push(ch); - if !in_literal { - // Check for datatype or language tag after closing quote - while let Some(&next_ch) = chars.peek() { - if next_ch == '^' || next_ch == '@' { - current_part.push(chars.next().unwrap()); - // Handle ^^ for datatypes - if next_ch == '^' { - if let Some(&second_caret) = chars.peek() { - if second_caret == '^' { - current_part.push(chars.next().unwrap()); - // Now consume the datatype URI - while let Some(&datatype_ch) = chars.peek() { - if datatype_ch == '<' { - // Start of datatype URI - current_part.push(chars.next().unwrap()); - let mut 
in_datatype_uri = true; - while let Some(&uri_ch) = chars.peek() { - current_part.push(chars.next().unwrap()); - if uri_ch == '>' { - in_datatype_uri = false; - break; - } - } - if !in_datatype_uri { - break; - } - } else if datatype_ch.is_whitespace() { - break; - } else { - current_part.push(chars.next().unwrap()); - } - } - } - } - } else if next_ch == '@' { - // Language tag - while let Some(&lang_ch) = chars.peek() { - if lang_ch.is_alphanumeric() || lang_ch == '-' { - current_part.push(chars.next().unwrap()); - } else { - break; - } - } - } - break; - } else if next_ch.is_whitespace() { - break; - } else { - // Unexpected character after literal - break; - } - } - parts.push(current_part.trim().to_string()); - current_part.clear(); - } - } - '\\' if (in_uri || in_literal) && !escaped => { - escaped = true; - current_part.push(ch); - } - ' ' | '\t' if !in_uri && !in_literal && !escaped => { - if !current_part.is_empty() { - parts.push(current_part.trim().to_string()); - current_part.clear(); - } - } - _ => { - escaped = false; - current_part.push(ch); - } - } + '\\' if (in_uri || in_literal) && !escaped => { + escaped = true; + current_part.push(ch); } - - if !current_part.is_empty() { + ' ' | '\t' if !in_uri && !in_literal && !escaped => { + if !current_part.is_empty() { parts.push(current_part.trim().to_string()); - } - - if parts.len() == 3 { - let subject = self.clean_ntriples_term(&parts[0]); - // Expand the Turtle `a` shorthand for rdf:type in predicate position. - let predicate = if parts[1] == "a" { - "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".to_string() - } else { - self.clean_ntriples_term(&parts[1]) + current_part.clear(); + } + } + _ => { + escaped = false; + current_part.push(ch); + } + } + } + + if !current_part.is_empty() { + parts.push(current_part.trim().to_string()); + } + + if parts.len() == 3 { + let subject = self.clean_ntriples_term(&parts[0]); + // Expand the Turtle `a` shorthand for rdf:type in predicate position. 
+ let predicate = if parts[1] == "a" { + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".to_string() + } else { + self.clean_ntriples_term(&parts[1]) + }; + let object = self.clean_ntriples_term(&parts[2]); + Some((subject, predicate, object)) + } else { + eprintln!("Invalid N-Triples line (expected 3 parts, got {}): {}", parts.len(), line); + None + } + } + + // Helper method to clean N-Triples terms + fn clean_ntriples_term(&self, term: &str) -> String { + let term = term.trim(); + + // Handle URIs + if term.starts_with('<') && term.ends_with('>') { + return term[1..term.len()-1].to_string(); + } + + // Handle literals (keep quotes and datatype/language info) + if term.starts_with('"') { + if let Some(close_quote_pos) = term[1..].find('"') { + let close_quote_pos = close_quote_pos + 1; + let literal_value = &term[1..close_quote_pos]; + let rest = &term[close_quote_pos + 1..]; + if rest.is_empty() { + return literal_value.to_string(); + } else if rest.starts_with("^^") { + return literal_value.to_string(); + } else if rest.starts_with("@") { + return format!("{}{}", literal_value, rest); + } + } + } + + // Return as-is for other cases + term.to_string() + } + + fn parse_statement(&mut self, statement: &str) { + let mut tokens = statement.split_whitespace().peekable(); + let mut subject = String::new(); + let mut predicate = String::new(); + let mut current_state = "subject"; + + while let Some(token) = tokens.next() { + match token { + ";" => { + predicate.clear(); + current_state = "predicate"; + } + "." => { + // End of statement + break; + } + _ => match current_state { + "subject" => { + subject = token.to_string(); + current_state = "predicate"; + } + "predicate" => { + predicate = token.to_string(); + current_state = "object"; + } + "object" => { + let mut object = token.to_string(); + + // Collect tokens until we reach ';', '.', or ',' + while let Some(next_token) = tokens.peek() { + if *next_token == ";" || *next_token == "." 
|| *next_token == "," { + break; + } + // Consume the token + let next_token = tokens.next().unwrap(); + object.push(' '); + object.push_str(next_token); + } + + // Resolve terms and store the triple + let resolved_subject = self.resolve_term(&subject); + let resolved_predicate = self.resolve_term(&predicate); + let resolved_object = self.resolve_term(&object); + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&resolved_subject), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(&resolved_object), }; - let object = self.clean_ntriples_term(&parts[2]); - Some((subject, predicate, object)) - } else { - eprintln!("Invalid N-Triples line (expected 3 parts, got {}): {}", parts.len(), line); - None - } - } - - // Helper method to clean N-Triples terms - fn clean_ntriples_term(&self, term: &str) -> String { - let term = term.trim(); - - // Handle URIs - if term.starts_with('<') && term.ends_with('>') { - return term[1..term.len()-1].to_string(); - } - - // Handle literals (keep quotes and datatype/language info) - if term.starts_with('"') { - if let Some(close_quote_pos) = term[1..].find('"') { - let close_quote_pos = close_quote_pos + 1; - let literal_value = &term[1..close_quote_pos]; - let rest = &term[close_quote_pos + 1..]; - if rest.is_empty() { - return literal_value.to_string(); - } else if rest.starts_with("^^") { - return literal_value.to_string(); - } else if rest.starts_with("@") { - return format!("{}{}", literal_value, rest); - } - } + drop(dict); + self.triples.insert(triple); + + current_state = "predicate"; + } + _ => {} + }, + } + } + } + + fn resolve_term(&self, term: &str) -> String { + if term.starts_with('<') && term.ends_with('>') { + term.trim_start_matches('<') + .trim_end_matches('>') + .to_string() + } else if term.starts_with('"') { + // It's a literal, possibly with a datatype or language tag + if let Some(pos) = term.rfind('"') { + let literal = &term[..=pos]; // Include the 
closing quote + let rest = &term[pos + 1..]; // After the closing quote + let mut result = literal.to_string(); + if rest.starts_with("^^") { + // It's a typed literal + let datatype = rest[2..].trim(); + let resolved_datatype = self.resolve_term(datatype); + result.push_str("^^"); + result.push_str(&resolved_datatype); + } else if rest.starts_with('@') { + // It's a language-tagged literal + result.push_str(rest); } - - // Return as-is for other cases + result + } else { + // Malformed literal term.to_string() - } - - fn parse_statement(&mut self, statement: &str) { - let mut tokens = statement.split_whitespace().peekable(); - let mut subject = String::new(); - let mut predicate = String::new(); - let mut current_state = "subject"; + } + } else if term.contains(':') + && !term.starts_with("http://") + && !term.starts_with("https://") + { + let mut parts = term.splitn(2, ':'); + let prefix = parts.next().unwrap(); + let local_name = parts.next().unwrap_or(""); + if let Some(uri) = self.prefixes.get(prefix) { + format!("{}{}", uri, local_name) + } else { + eprintln!("Unknown prefix: {}", prefix); + term.to_string() + } + } else { + term.to_string() + } + } + + // Method to automatically extract and register prefixes from a query string + pub fn register_prefixes_from_query(&mut self, query: &str) { + // Simple regex to extract PREFIX declarations + let prefix_pattern = regex::Regex::new(r"PREFIX\s+([a-zA-Z0-9_]+):\s*<([^>]+)>").unwrap(); + + for captures in prefix_pattern.captures_iter(query) { + if captures.len() >= 3 { + let prefix = captures[1].to_string(); + let uri = captures[2].to_string(); + self.prefixes.insert(prefix, uri); + } + } + } + + // Method to ensure prefixes are properly shared between components + pub fn share_prefixes_with(&self, prefixes: &mut HashMap) { + for (prefix, uri) in &self.prefixes { + prefixes.insert(prefix.clone(), uri.clone()); + } + } + + pub fn resolve_query_term(&self, term: &str, prefixes: &HashMap) -> String { + if 
term.starts_with('<') && term.ends_with('>') { + term.trim_start_matches('<') + .trim_end_matches('>') + .to_string() + } else if term.starts_with('"') && term.ends_with('"') { + term.trim_matches('"').to_string() + } else if term.contains(':') + && !term.starts_with("http://") + && !term.starts_with("https://") + { + let mut parts = term.splitn(2, ':'); + let prefix = parts.next().unwrap(); + let local_name = parts.next().unwrap_or(""); + + // First check the passed prefixes map + if let Some(uri) = prefixes.get(prefix) { + format!("{}{}", uri, local_name) + } + // Then check the database's own prefixes map as a fallback + else if let Some(uri) = self.prefixes.get(prefix) { + format!("{}{}", uri, local_name) + } else { + eprintln!("Unknown prefix in query: {}", prefix); + term.to_string() + } + } else { + term.to_string() + } + } + + pub fn add_stream_data(&mut self, triple: Triple, timestamp: u64) { + self.streams.push(TimestampedTriple { triple, timestamp }); + } + + pub fn time_based_window(&self, start: u64, end: u64) -> BTreeSet { + self.streams + .iter() + .filter(|ts_triple| ts_triple.timestamp >= start && ts_triple.timestamp <= end) + .map(|ts_triple| ts_triple.triple.clone()) + .collect() + } + + pub fn apply_filters_simd<'a>( + &self, + results: Vec>, + filters: Vec>, + ) -> Vec> { + results + .into_iter() + .filter(|result| { + filters.iter().all(|filter_expr| { + match filter_expr { + FilterExpression::Comparison(var, operator, value) => { + // Check if either side contains arithmetic operations + let has_arithmetic = var.contains('+') || var.contains('-') || + var.contains('*') || var.contains('/') || + value.contains('+') || value.contains('-') || + value.contains('*') || value.contains('/'); + + if has_arithmetic { + // Use the non-SIMD arithmetic expression evaluator for complex expressions + let left_result = self.evaluate_arithmetic_string(result, var); + let right_result = self.evaluate_arithmetic_string(result, value); - while let Some(token) = 
tokens.next() { - match token { - ";" => { - predicate.clear(); - current_state = "predicate"; - } - "." => { - // End of statement - break; - } - _ => match current_state { - "subject" => { - subject = token.to_string(); - current_state = "predicate"; - } - "predicate" => { - predicate = token.to_string(); - current_state = "object"; + match (left_result, right_result) { + (Ok(left_val), Ok(right_val)) => { + // Both sides are numeric, perform comparison + match *operator { + "=" => left_val == right_val, + "!=" => left_val != right_val, + ">" => left_val > right_val, + ">=" => left_val >= right_val, + "<" => left_val < right_val, + "<=" => left_val <= right_val, + _ => false, + } + }, + _ => false // At least one expression couldn't be evaluated + } + } else { + // For simple expressions without arithmetic operators, use the SIMD approach + if let Some(var_value_str) = result.get(var) { + // First, try parsing both values as numbers + let var_value_num = var_value_str.parse::(); + let filter_value_num = value.parse::(); + + if var_value_num.is_ok() && filter_value_num.is_ok() { + // Both values are numeric, perform SIMD numeric comparison + let var_value = var_value_num.unwrap(); + let filter_value = filter_value_num.unwrap(); + + // On x86 (SSE2) or x86_64 (SSE2) use SIMD intrinsics + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + unsafe { + // Load values into SIMD registers + let var_simd = _mm_set1_epi32(var_value); + let filter_simd = _mm_set1_epi32(filter_value); + return match *operator { + "=" => _mm_movemask_epi8(_mm_cmpeq_epi32( + var_simd, + filter_simd, + )) == 0xFFFF, + "!=" => _mm_movemask_epi8(_mm_cmpeq_epi32( + var_simd, + filter_simd, + )) != 0xFFFF, + ">" => _mm_movemask_epi8(_mm_cmpgt_epi32( + var_simd, + filter_simd, + )) == 0xFFFF, + ">=" => { + let eq = _mm_cmpeq_epi32(var_simd, filter_simd); + let gt = _mm_cmpgt_epi32(var_simd, filter_simd); + _mm_movemask_epi8(_mm_or_si128(eq, gt)) == 0xFFFF + } + "<" => 
_mm_movemask_epi8(_mm_cmpgt_epi32( + filter_simd, + var_simd, + )) == 0xFFFF, + "<=" => { + let eq = _mm_cmpeq_epi32(var_simd, filter_simd); + let lt = _mm_cmpgt_epi32(filter_simd, var_simd); + _mm_movemask_epi8(_mm_or_si128(eq, lt)) == 0xFFFF + } + _ => false, + }; + } } - "object" => { - let mut object = token.to_string(); - // Collect tokens until we reach ';', '.', or ',' - while let Some(next_token) = tokens.peek() { - if *next_token == ";" || *next_token == "." || *next_token == "," { - break; - } - // Consume the token - let next_token = tokens.next().unwrap(); - object.push(' '); - object.push_str(next_token); + // On ARM (aarch64) use NEON intrinsics + #[cfg(target_arch = "aarch64")] + { + unsafe { + let var_neon = vdupq_n_s32(var_value); + let filter_neon = vdupq_n_s32(filter_value); + return match *operator { + "=" => { + let cmp = vceqq_s32(var_neon, filter_neon); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + } + "!=" => { + let cmp = vceqq_s32(var_neon, filter_neon); + !((vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF)) + } + ">" => { + let cmp = vcgtq_s32(var_neon, filter_neon); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + } + ">=" => { + let eq = vceqq_s32(var_neon, filter_neon); + let gt = vcgtq_s32(var_neon, filter_neon); + let cmp = vorrq_u32(eq, gt); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + } + "<" => { + let cmp = vcgtq_s32(filter_neon, var_neon); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) + && 
(vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + } + "<=" => { + let eq = vceqq_s32(var_neon, filter_neon); + let lt = vcgtq_s32(filter_neon, var_neon); + let cmp = vorrq_u32(eq, lt); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + } + _ => false, } - - // Resolve terms and store the triple - let resolved_subject = self.resolve_term(&subject); - let resolved_predicate = self.resolve_term(&predicate); - let resolved_object = self.resolve_term(&object); - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&resolved_subject), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(&resolved_object), - }; - drop(dict); - self.triples.insert(triple); - - current_state = "predicate"; + } } - _ => {} - }, - } - } - } - - fn resolve_term(&self, term: &str) -> String { - if term.starts_with('<') && term.ends_with('>') { - term.trim_start_matches('<') - .trim_end_matches('>') - .to_string() - } else if term.starts_with('"') { - // It's a literal, possibly with a datatype or language tag - if let Some(pos) = term.rfind('"') { - let literal = &term[..=pos]; // Include the closing quote - let rest = &term[pos + 1..]; // After the closing quote - let mut result = literal.to_string(); - if rest.starts_with("^^") { - // It's a typed literal - let datatype = rest[2..].trim(); - let resolved_datatype = self.resolve_term(datatype); - result.push_str("^^"); - result.push_str(&resolved_datatype); - } else if rest.starts_with('@') { - // It's a language-tagged literal - result.push_str(rest); - } - result - } else { - // Malformed literal - term.to_string() - } - } else if term.contains(':') - && !term.starts_with("http://") - && !term.starts_with("https://") - { - let mut parts = term.splitn(2, ':'); - let prefix = parts.next().unwrap(); - let local_name = 
parts.next().unwrap_or(""); - if let Some(uri) = self.prefixes.get(prefix) { - format!("{}{}", uri, local_name) - } else { - eprintln!("Unknown prefix: {}", prefix); - term.to_string() - } - } else { - term.to_string() - } - } - // Method to automatically extract and register prefixes from a query string - pub fn register_prefixes_from_query(&mut self, query: &str) { - // Simple regex to extract PREFIX declarations - let prefix_pattern = regex::Regex::new(r"PREFIX\s+([a-zA-Z0-9_]+):\s*<([^>]+)>").unwrap(); - - for captures in prefix_pattern.captures_iter(query) { - if captures.len() >= 3 { - let prefix = captures[1].to_string(); - let uri = captures[2].to_string(); - self.prefixes.insert(prefix, uri); - } - } - } - - // Method to ensure prefixes are properly shared between components - pub fn share_prefixes_with(&self, prefixes: &mut HashMap) { - for (prefix, uri) in &self.prefixes { - prefixes.insert(prefix.clone(), uri.clone()); - } - } - - pub fn resolve_query_term(&self, term: &str, prefixes: &HashMap) -> String { - if term.starts_with('<') && term.ends_with('>') { - term.trim_start_matches('<') - .trim_end_matches('>') - .to_string() - } else if term.starts_with('"') && term.ends_with('"') { - term.trim_matches('"').to_string() - } else if term.contains(':') - && !term.starts_with("http://") - && !term.starts_with("https://") - { - let mut parts = term.splitn(2, ':'); - let prefix = parts.next().unwrap(); - let local_name = parts.next().unwrap_or(""); - - // First check the passed prefixes map - if let Some(uri) = prefixes.get(prefix) { - format!("{}{}", uri, local_name) - } - // Then check the database's own prefixes map as a fallback - else if let Some(uri) = self.prefixes.get(prefix) { - format!("{}{}", uri, local_name) - } else { - eprintln!("Unknown prefix in query: {}", prefix); - term.to_string() - } - } else { - term.to_string() - } - } - - pub fn add_stream_data(&mut self, triple: Triple, timestamp: u64) { - self.streams.push(TimestampedTriple { 
triple, timestamp }); - } - - pub fn time_based_window(&self, start: u64, end: u64) -> BTreeSet { - self.streams - .iter() - .filter(|ts_triple| ts_triple.timestamp >= start && ts_triple.timestamp <= end) - .map(|ts_triple| ts_triple.triple.clone()) - .collect() - } - - pub fn apply_filters_simd<'a>( - &self, - results: Vec>, - filters: Vec>, - ) -> Vec> { - results - .into_iter() - .filter(|result| { - filters.iter().all(|filter_expr| { - match filter_expr { - FilterExpression::Comparison(var, operator, value) => { - // Check if either side contains arithmetic operations - let has_arithmetic = var.contains('+') || var.contains('-') || - var.contains('*') || var.contains('/') || - value.contains('+') || value.contains('-') || - value.contains('*') || value.contains('/'); - - if has_arithmetic { - // Use the non-SIMD arithmetic expression evaluator for complex expressions - let left_result = self.evaluate_arithmetic_string(result, var); - let right_result = self.evaluate_arithmetic_string(result, value); - - match (left_result, right_result) { - (Ok(left_val), Ok(right_val)) => { - // Both sides are numeric, perform comparison - match *operator { - "=" => left_val == right_val, - "!=" => left_val != right_val, - ">" => left_val > right_val, - ">=" => left_val >= right_val, - "<" => left_val < right_val, - "<=" => left_val <= right_val, - _ => false, - } - }, - _ => false // At least one expression couldn't be evaluated - } - } else { - // For simple expressions without arithmetic operators, use the SIMD approach - if let Some(var_value_str) = result.get(var) { - // First, try parsing both values as numbers - let var_value_num = var_value_str.parse::(); - let filter_value_num = value.parse::(); - - if var_value_num.is_ok() && filter_value_num.is_ok() { - // Both values are numeric, perform SIMD numeric comparison - let var_value = var_value_num.unwrap(); - let filter_value = filter_value_num.unwrap(); - - // On x86 (SSE2) or x86_64 (SSE2) use SIMD intrinsics - 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - unsafe { - // Load values into SIMD registers - let var_simd = _mm_set1_epi32(var_value); - let filter_simd = _mm_set1_epi32(filter_value); - return match *operator { - "=" => _mm_movemask_epi8(_mm_cmpeq_epi32( - var_simd, - filter_simd, - )) == 0xFFFF, - "!=" => _mm_movemask_epi8(_mm_cmpeq_epi32( - var_simd, - filter_simd, - )) != 0xFFFF, - ">" => _mm_movemask_epi8(_mm_cmpgt_epi32( - var_simd, - filter_simd, - )) == 0xFFFF, - ">=" => { - let eq = _mm_cmpeq_epi32(var_simd, filter_simd); - let gt = _mm_cmpgt_epi32(var_simd, filter_simd); - _mm_movemask_epi8(_mm_or_si128(eq, gt)) == 0xFFFF - } - "<" => _mm_movemask_epi8(_mm_cmpgt_epi32( - filter_simd, - var_simd, - )) == 0xFFFF, - "<=" => { - let eq = _mm_cmpeq_epi32(var_simd, filter_simd); - let lt = _mm_cmpgt_epi32(filter_simd, var_simd); - _mm_movemask_epi8(_mm_or_si128(eq, lt)) == 0xFFFF - } - _ => false, - }; - } - } - - // On ARM (aarch64) use NEON intrinsics - #[cfg(target_arch = "aarch64")] - { - unsafe { - let var_neon = vdupq_n_s32(var_value); - let filter_neon = vdupq_n_s32(filter_value); - return match *operator { - "=" => { - let cmp = vceqq_s32(var_neon, filter_neon); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - "!=" => { - let cmp = vceqq_s32(var_neon, filter_neon); - !((vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF)) - } - ">" => { - let cmp = vcgtq_s32(var_neon, filter_neon); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - ">=" => { - let eq = vceqq_s32(var_neon, filter_neon); - let gt = vcgtq_s32(var_neon, filter_neon); - let cmp = vorrq_u32(eq, gt); - 
(vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - "<" => { - let cmp = vcgtq_s32(filter_neon, var_neon); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - "<=" => { - let eq = vceqq_s32(var_neon, filter_neon); - let lt = vcgtq_s32(filter_neon, var_neon); - let cmp = vorrq_u32(eq, lt); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - _ => false, - } - } - } - - // Fallback (or if compiled for a non‐SIMD platform) - #[cfg(not(any( - target_arch = "x86", - target_arch = "x86_64", - target_arch = "aarch64" - )))] - { - return match *operator { - "=" => var_value == filter_value, - "!=" => var_value != filter_value, - ">" => var_value > filter_value, - ">=" => var_value >= filter_value, - "<" => var_value < filter_value, - "<=" => var_value <= filter_value, - _ => false, - }; - } - } else { - // At least one value is a string, perform string comparison - let var_bytes = var_value_str.as_bytes(); - let filter_bytes = value.as_bytes(); - - let var_len = var_bytes.len(); - let filter_len = filter_bytes.len(); - - // If lengths differ, they can't be equal - if var_len != filter_len { - return match *operator { - "=" => false, - "!=" => true, - _ => false, // Other operators are not supported for strings - }; - } - - let mut i = 0; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - unsafe { - while i + 16 <= var_len { - let var_chunk = _mm_loadu_si128( - var_bytes[i..].as_ptr() as *const __m128i, - ); - let filter_chunk = _mm_loadu_si128( - filter_bytes[i..].as_ptr() as *const __m128i, - ); - let cmp = _mm_cmpeq_epi8(var_chunk, filter_chunk); - let mask = _mm_movemask_epi8(cmp); - 
if mask != 0xFFFF { - return match *operator { - "=" => false, - "!=" => true, - _ => false, - }; - } - i += 16; - } - } - } - - #[cfg(target_arch = "aarch64")] - { - unsafe { - while i + 16 <= var_len { - let var_chunk = vld1q_u8(var_bytes[i..].as_ptr()); - let filter_chunk = vld1q_u8(filter_bytes[i..].as_ptr()); - let cmp = vceqq_u8(var_chunk, filter_chunk); - let cmp_arr: [u8; 16] = std::mem::transmute(cmp); - if cmp_arr.iter().any(|&lane| lane != 0xFF) { - return match *operator { - "=" => false, - "!=" => true, - _ => false, - }; - } - i += 16; - } - } - } - - // Handle remaining bytes - if i < var_len { - for j in i..var_len { - if var_bytes[j] != filter_bytes[j] { - return match *operator { - "=" => false, - "!=" => true, - _ => false, - }; - } - } - } - - // Strings are equal - match *operator { - "=" => true, - "!=" => false, - _ => false, // Other operators not supported for strings - } - } - } else { - false - } - } - }, - FilterExpression::And(left, right) => { - self.evaluate_filter_expression(result, left) && - self.evaluate_filter_expression(result, right) - }, - FilterExpression::Or(left, right) => { - self.evaluate_filter_expression(result, left) || - self.evaluate_filter_expression(result, right) - }, - FilterExpression::Not(expr) => { - !self.evaluate_filter_expression(result, expr) - }, - FilterExpression::ArithmeticExpr(expr_str) => { - // True if it's non-zero - match self.evaluate_arithmetic_string(result, expr_str) { - Ok(val) => val != 0.0, - Err(_) => false, - } + // Fallback (or if compiled for a non‐SIMD platform) + #[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "aarch64" + )))] + { + return match *operator { + "=" => var_value == filter_value, + "!=" => var_value != filter_value, + ">" => var_value > filter_value, + ">=" => var_value >= filter_value, + "<" => var_value < filter_value, + "<=" => var_value <= filter_value, + _ => false, + }; + } + } else { + // At least one value is a string, perform 
string comparison + let var_bytes = var_value_str.as_bytes(); + let filter_bytes = value.as_bytes(); + + let var_len = var_bytes.len(); + let filter_len = filter_bytes.len(); + + // If lengths differ, they can't be equal + if var_len != filter_len { + return match *operator { + "=" => false, + "!=" => true, + _ => false, // Other operators are not supported for strings + }; + } + + let mut i = 0; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + unsafe { + while i + 16 <= var_len { + let var_chunk = _mm_loadu_si128( + var_bytes[i..].as_ptr() as *const __m128i, + ); + let filter_chunk = _mm_loadu_si128( + filter_bytes[i..].as_ptr() as *const __m128i, + ); + let cmp = _mm_cmpeq_epi8(var_chunk, filter_chunk); + let mask = _mm_movemask_epi8(cmp); + if mask != 0xFFFF { + return match *operator { + "=" => false, + "!=" => true, + _ => false, + }; + } + i += 16; } + } } - }) - }) - .collect() - } - - // Helper function to evaluate an arithmetic expression - fn evaluate_arithmetic_expression<'a>( - &self, - result: &BTreeMap<&'a str, String>, - expr: &shared::query::ArithmeticExpression<'a> - ) -> Result { - match expr { - shared::query::ArithmeticExpression::Operand(operand) => { - // Check if it's a variable - if operand.starts_with('?') { - if let Some(var_value) = result.get(*operand) { - // Parse the variable value as a number - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) - } else { - Err(format!("Variable '{}' not found", operand)) - } - } - // Check if it's a numeric literal - else if operand.chars().all(|c| c.is_digit(10) || c == '.') { - operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) - } - // Check if it's a string literal - else if operand.starts_with('"') && operand.ends_with('"') { - Err(format!("Cannot perform arithmetic on string literal '{}'", operand)) - } - // Parse it as a number - else { - operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", 
operand)) - } - }, - shared::query::ArithmeticExpression::Add(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - Ok(left_val + right_val) - }, - shared::query::ArithmeticExpression::Subtract(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - Ok(left_val - right_val) - }, - shared::query::ArithmeticExpression::Multiply(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - Ok(left_val * right_val) - }, - shared::query::ArithmeticExpression::Divide(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - if right_val == 0.0 { - Err("Division by zero".to_string()) - } else { - Ok(left_val / right_val) - } - } - } - } - // Helper function to parse and evaluate an arithmetic expression from a string - fn evaluate_arithmetic_string<'a>( - &self, - result: &BTreeMap<&'a str, String>, - expr_str: &'a str - ) -> Result { - // Check for parenthesized expressions and remove them if needed - let expr_to_parse = if expr_str.starts_with('(') && expr_str.ends_with(')') { - &expr_str[1..expr_str.len()-1] - } else { - expr_str - }; - - if expr_to_parse.contains('+') || expr_to_parse.contains('-') || - expr_to_parse.contains('*') || expr_to_parse.contains('/') { - // Parse the expression string into an ArithmeticExpression - match parser::parse_arithmetic_expression(expr_to_parse) { - Ok((_, arithmetic_expr)) => { - // Evaluate the parsed expression - self.evaluate_arithmetic_expression(result, &arithmetic_expr) - }, - Err(e) => { - // Print the error - eprintln!("Failed to parse arithmetic expression '{}': {:?}", expr_to_parse, e); - - // If parsing fails, try to 
treat it as a simple operand - if expr_to_parse.starts_with('?') { - // It's a variable - if let Some(var_value) = result.get(expr_to_parse) { - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) - } else { - Err(format!("Variable '{}' not found", expr_to_parse)) + #[cfg(target_arch = "aarch64")] + { + unsafe { + while i + 16 <= var_len { + let var_chunk = vld1q_u8(var_bytes[i..].as_ptr()); + let filter_chunk = vld1q_u8(filter_bytes[i..].as_ptr()); + let cmp = vceqq_u8(var_chunk, filter_chunk); + let cmp_arr: [u8; 16] = std::mem::transmute(cmp); + if cmp_arr.iter().any(|&lane| lane != 0xFF) { + return match *operator { + "=" => false, + "!=" => true, + _ => false, + }; + } + i += 16; } - } else { - // Parse as a number - expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + } } - } - } - } else { - // No arithmetic operators, treat as simple operand - if expr_to_parse.starts_with('?') { - // It's a variable - if let Some(var_value) = result.get(expr_to_parse) { - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) - } else { - Err(format!("Variable '{}' not found", expr_to_parse)) - } - } else { - // Parse as a number - expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) - } - } - } - // Helper method to evaluate a filter expression against a result - fn evaluate_filter_expression<'a>( - &self, - result: &BTreeMap<&'a str, String>, - filter_expr: &FilterExpression<'a> - ) -> bool { - match filter_expr { - FilterExpression::Comparison(left, operator, right) => { - // Evaluate both sides as arithmetic expressions - let left_result = self.evaluate_arithmetic_string(result, left); - let right_result = self.evaluate_arithmetic_string(result, right); - - match (left_result, right_result) { - (Ok(left_val), Ok(right_val)) => { - // Both sides are numeric, perform numeric comparison - match *operator { - "=" => left_val == 
right_val, - "!=" => left_val != right_val, - ">" => left_val > right_val, - ">=" => left_val >= right_val, - "<" => left_val < right_val, - "<=" => left_val <= right_val, + // Handle remaining bytes + if i < var_len { + for j in i..var_len { + if var_bytes[j] != filter_bytes[j] { + return match *operator { + "=" => false, + "!=" => true, _ => false, + }; } - }, - _ => { - let left_str = if left.starts_with('?') { - // Fix for the type mismatch error - convert to string - match result.get(left) { - Some(val) => val.as_str(), - None => left, - } - } else { - left - }; - - let right_str = if right.starts_with('?') { - // Fix for the type mismatch error - convert to string - match result.get(right) { - Some(val) => val.as_str(), - None => right, - } - } else { - right - }; - - match *operator { - "=" => left_str == right_str, - "!=" => left_str != right_str, - _ => false, // Other operators not supported for strings - } + } } + + // Strings are equal + match *operator { + "=" => true, + "!=" => false, + _ => false, // Other operators not supported for strings + } + } + } else { + false } + } }, FilterExpression::And(left, right) => { - self.evaluate_filter_expression(result, left) && - self.evaluate_filter_expression(result, right) + self.evaluate_filter_expression(result, left) && + self.evaluate_filter_expression(result, right) }, FilterExpression::Or(left, right) => { - self.evaluate_filter_expression(result, left) || - self.evaluate_filter_expression(result, right) + self.evaluate_filter_expression(result, left) || + self.evaluate_filter_expression(result, right) }, FilterExpression::Not(expr) => { - !self.evaluate_filter_expression(result, expr) + !self.evaluate_filter_expression(result, expr) }, FilterExpression::ArithmeticExpr(expr_str) => { - // An arithmetic expression by itself is evaluated to true if it's non-zero - match self.evaluate_arithmetic_string(result, expr_str) { - Ok(val) => val != 0.0, - Err(_) => false, - } + // True if it's non-zero + match 
self.evaluate_arithmetic_string(result, expr_str) { + Ok(val) => val != 0.0, + Err(_) => false, + } } - } - } - - pub fn union(&mut self, other: &SparqlDatabase) -> Self { - // Create a new dictionary by cloning and merging - let self_dict = self.dictionary.read().unwrap(); - let other_dict = other.dictionary.read().unwrap(); - let mut merged_dictionary = self_dict.clone(); - drop(self_dict); - - // Re-encode triples from the other database using the merged dictionary - let mut re_encoded_triples = BTreeSet::new(); - for triple in &other.triples { - let subject = - merged_dictionary.encode(other_dict.decode(triple.subject).unwrap()); - let predicate = - merged_dictionary.encode(other_dict.decode(triple.predicate).unwrap()); - let object = merged_dictionary.encode(other_dict.decode(triple.object).unwrap()); - re_encoded_triples.insert(Triple { - subject, - predicate, - object, - }); - } - - // Merge the triples and streams - let union_triples: BTreeSet = - self.triples.union(&re_encoded_triples).cloned().collect(); - let mut union_streams = self.streams.clone(); - for ts_triple in &other.streams { - let subject = merged_dictionary - .encode(other_dict.decode(ts_triple.triple.subject).unwrap()); - let predicate = merged_dictionary - .encode(other_dict.decode(ts_triple.triple.predicate).unwrap()); - let object = - merged_dictionary.encode(other_dict.decode(ts_triple.triple.object).unwrap()); - let re_encoded_ts_triple = TimestampedTriple { - triple: Triple { - subject, - predicate, - object, - }, - timestamp: ts_triple.timestamp, + } + }) + }) + .collect() + } + + // Helper function to evaluate an arithmetic expression + fn evaluate_arithmetic_expression<'a>( + &self, + result: &BTreeMap<&'a str, String>, + expr: &shared::query::ArithmeticExpression<'a> + ) -> Result { + match expr { + shared::query::ArithmeticExpression::Operand(operand) => { + // Check if it's a variable + if operand.starts_with('?') { + if let Some(var_value) = result.get(*operand) { + // Parse the 
variable value as a number + var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + } else { + Err(format!("Variable '{}' not found", operand)) + } + } + // Check if it's a numeric literal + else if operand.chars().all(|c| c.is_digit(10) || c == '.') { + operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) + } + // Check if it's a string literal + else if operand.starts_with('"') && operand.ends_with('"') { + Err(format!("Cannot perform arithmetic on string literal '{}'", operand)) + } + // Parse it as a number + else { + operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) + } + }, + shared::query::ArithmeticExpression::Add(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + Ok(left_val + right_val) + }, + shared::query::ArithmeticExpression::Subtract(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + Ok(left_val - right_val) + }, + shared::query::ArithmeticExpression::Multiply(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + Ok(left_val * right_val) + }, + shared::query::ArithmeticExpression::Divide(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + if right_val == 0.0 { + Err("Division by zero".to_string()) + } else { + Ok(left_val / right_val) + } + } + } + } + + // Helper function to parse and evaluate an arithmetic expression from a string + fn evaluate_arithmetic_string<'a>( + &self, + result: &BTreeMap<&'a str, String>, + expr_str: &'a str + ) -> Result { + // Check for parenthesized expressions and remove them if needed + let expr_to_parse = 
if expr_str.starts_with('(') && expr_str.ends_with(')') { + &expr_str[1..expr_str.len()-1] + } else { + expr_str + }; + + if expr_to_parse.contains('+') || expr_to_parse.contains('-') || + expr_to_parse.contains('*') || expr_to_parse.contains('/') { + // Parse the expression string into an ArithmeticExpression + match parser::parse_arithmetic_expression(expr_to_parse) { + Ok((_, arithmetic_expr)) => { + // Evaluate the parsed expression + self.evaluate_arithmetic_expression(result, &arithmetic_expr) + }, + Err(e) => { + // Print the error + eprintln!("Failed to parse arithmetic expression '{}': {:?}", expr_to_parse, e); + + // If parsing fails, try to treat it as a simple operand + if expr_to_parse.starts_with('?') { + // It's a variable + if let Some(var_value) = result.get(expr_to_parse) { + var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + } else { + Err(format!("Variable '{}' not found", expr_to_parse)) + } + } else { + // Parse as a number + expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + } + } + } + } else { + // No arithmetic operators, treat as simple operand + if expr_to_parse.starts_with('?') { + // It's a variable + if let Some(var_value) = result.get(expr_to_parse) { + var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + } else { + Err(format!("Variable '{}' not found", expr_to_parse)) + } + } else { + // Parse as a number + expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + } + } + } + + // Helper method to evaluate a filter expression against a result + fn evaluate_filter_expression<'a>( + &self, + result: &BTreeMap<&'a str, String>, + filter_expr: &FilterExpression<'a> + ) -> bool { + match filter_expr { + FilterExpression::Comparison(left, operator, right) => { + // Evaluate both sides as arithmetic expressions + let left_result = self.evaluate_arithmetic_string(result, left); + let 
right_result = self.evaluate_arithmetic_string(result, right); + + match (left_result, right_result) { + (Ok(left_val), Ok(right_val)) => { + // Both sides are numeric, perform numeric comparison + match *operator { + "=" => left_val == right_val, + "!=" => left_val != right_val, + ">" => left_val > right_val, + ">=" => left_val >= right_val, + "<" => left_val < right_val, + "<=" => left_val <= right_val, + _ => false, + } + }, + _ => { + let left_str = if left.starts_with('?') { + // Fix for the type mismatch error - convert to string + match result.get(left) { + Some(val) => val.as_str(), + None => left, + } + } else { + left }; - if !union_streams.contains(&re_encoded_ts_triple) { - union_streams.push(re_encoded_ts_triple); - } - } - drop(other_dict); - - Self { - triples: union_triples, - streams: union_streams, - sliding_window: self.sliding_window.clone(), - dictionary: Arc::new(RwLock::new(merged_dictionary)), - prefixes: self.prefixes.clone(), - udfs: HashMap::new(), - index_manager: self.index_manager.clone_empty(), - rule_map: HashMap::new(), - cached_stats: None, - } - } - - pub fn par_join(&mut self, other: &SparqlDatabase, predicate: &str) -> Self { - let mut dict = self.dictionary.write().unwrap(); - let predicate_id = dict.encode(predicate); - drop(dict); - let other_map: BTreeMap<&u32, Vec<&Triple>> = other - .triples - .par_iter() - .filter(|other_triple| other_triple.predicate == predicate_id) - .flat_map(|other_triple| { - vec![ - (&other_triple.subject, other_triple), - (&other_triple.object, other_triple), - ] - }) - .fold( - || BTreeMap::new(), - |mut acc, (key, triple)| { - acc.entry(key).or_insert_with(Vec::new).push(triple); - acc - }, - ) - .reduce( - || BTreeMap::new(), - |mut acc, map| { - for (key, triples) in map { - acc.entry(key).or_insert_with(Vec::new).extend(triples); - } - acc - }, - ); - - let joined_triples: BTreeSet = self - .triples - .par_iter() - .filter(|triple| triple.predicate == predicate_id) - .fold( - || 
BTreeSet::new(), - |mut local_set, triple| { - if let Some(matching_triples) = other_map.get(&triple.object) { - for other_triple in matching_triples { - local_set.insert(Triple { - subject: triple.subject, - predicate: other_triple.predicate, - object: other_triple.object, - }); - } - } - local_set - }, - ) - .reduce( - || BTreeSet::new(), - |mut set1, set2| { - set1.extend(set2); - set1 - }, - ); - - Self { - triples: joined_triples, - streams: self.streams.clone(), - sliding_window: self.sliding_window.clone(), - dictionary: Arc::clone(&self.dictionary), - prefixes: self.prefixes.clone(), - udfs: HashMap::new(), - index_manager: self.index_manager.clone_empty(), - rule_map: HashMap::new(), - cached_stats: None, - } - } - - pub fn perform_join<'a>( - &self, - subject_var: &'a str, - predicate: &'a str, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - ) -> Vec> { - let mut new_results = Vec::new(); - - for triple in triples { - let subject = dictionary.decode(triple.subject).unwrap(); - let pred = dictionary.decode(triple.predicate).unwrap(); - let object = dictionary.decode(triple.object).unwrap(); - - if pred == predicate { - for result in &final_results { - let mut extended_result = result.clone(); - let mut valid_extension = true; - - // Check and extend the result with the subject - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - valid_extension = false; - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - - // Check and extend the result with the object - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - valid_extension = false; - } - } else { - extended_result.insert(object_var, object.to_string()); - } - - if valid_extension { - new_results.push(extended_result); - } - } - } - } - - new_results - } - - pub fn perform_join_par_simd_with_strict_filter_1<'a>( - &self, - 
subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &Arc>, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let dictionary = dictionary.read().unwrap(); - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - // Pre-allocate output vector - let results = Mutex::new(Vec::new()); - - // Using Rayon for parallel processing - triples.par_chunks(256).for_each(|chunk| { - let mut local_results = Vec::new(); - - for triple in chunk { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // SIMD predicate comparison - if pred.as_bytes() != predicate_bytes { - continue; - } - - // SIMD literal filter comparison - if let Some(filter_bytes) = literal_filter_bytes { - if object.as_bytes() != filter_bytes { - continue; - } - } - - // Process group 
both_vars_bound - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - let extended_result = result.clone(); - local_results.push(extended_result); - } - } - } - // Process group subject_var_bound - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); - } - local_results.push(extended_result); - } - } - } - - // Process group object_var_bound - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - local_results.push(extended_result); - } - } - } - - // Process group neither_var_bound - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); - } - local_results.push(extended_result); - } - } - } - - // Push local results to the shared results vector - let mut global_results = results.lock().unwrap(); - 
global_results.extend(local_results); - }); - - results.into_inner().unwrap() - } - - pub fn perform_join_par_simd_with_strict_filter_2<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings. - let mut both_vars_bound: HashMap<(String, String), Vec>> = HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound.entry(subj_val.clone()).or_default().push(result); - } - (None, Some(obj_val)) => { - object_var_bound.entry(obj_val.clone()).or_default().push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - // Pre-allocate output vector. - let results = Mutex::new(Vec::new()); - - // Using Rayon for parallel processing. - triples.par_chunks(256).for_each(|chunk| { - let mut local_results = Vec::new(); - - for triple in chunk { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // SIMD predicate comparison using simd_eq. - if !unsafe { simd_eq(pred.as_bytes(), predicate_bytes) } { - continue; - } - - // SIMD literal filter comparison. 
- if let Some(filter_bytes) = literal_filter_bytes { - if !unsafe { simd_eq(object.as_bytes(), filter_bytes) } { - continue; - } - } + let right_str = if right.starts_with('?') { + // Fix for the type mismatch error - convert to string + match result.get(right) { + Some(val) => val.as_str(), + None => right, + } + } else { + right + }; - // Process group both_vars_bound. - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - local_results.push(result.clone()); - } - } - } + match *operator { + "=" => left_str == right_str, + "!=" => left_str != right_str, + _ => false, // Other operators not supported for strings + } + } + } + }, + FilterExpression::And(left, right) => { + self.evaluate_filter_expression(result, left) && + self.evaluate_filter_expression(result, right) + }, + FilterExpression::Or(left, right) => { + self.evaluate_filter_expression(result, left) || + self.evaluate_filter_expression(result, right) + }, + FilterExpression::Not(expr) => { + !self.evaluate_filter_expression(result, expr) + }, + FilterExpression::ArithmeticExpr(expr_str) => { + // An arithmetic expression by itself is evaluated to true if it's non-zero + match self.evaluate_arithmetic_string(result, expr_str) { + Ok(val) => val != 0.0, + Err(_) => false, + } + } + } + } + + pub fn union(&mut self, other: &SparqlDatabase) -> Self { + // Create a new dictionary by cloning and merging + let self_dict = self.dictionary.read().unwrap(); + let other_dict = other.dictionary.read().unwrap(); + let mut merged_dictionary = self_dict.clone(); + drop(self_dict); + + // Re-encode triples from the other database using the merged dictionary + let mut re_encoded_triples = BTreeSet::new(); + for triple in &other.triples { + let subject = + merged_dictionary.encode(other_dict.decode(triple.subject).unwrap()); + let predicate = + merged_dictionary.encode(other_dict.decode(triple.predicate).unwrap()); + let object = 
merged_dictionary.encode(other_dict.decode(triple.object).unwrap()); + re_encoded_triples.insert(Triple { + subject, + predicate, + object, + }); + } + + // Merge the triples and streams + let union_triples: BTreeSet = + self.triples.union(&re_encoded_triples).cloned().collect(); + let mut union_streams = self.streams.clone(); + for ts_triple in &other.streams { + let subject = merged_dictionary + .encode(other_dict.decode(ts_triple.triple.subject).unwrap()); + let predicate = merged_dictionary + .encode(other_dict.decode(ts_triple.triple.predicate).unwrap()); + let object = + merged_dictionary.encode(other_dict.decode(ts_triple.triple.object).unwrap()); + let re_encoded_ts_triple = TimestampedTriple { + triple: Triple { + subject, + predicate, + object, + }, + timestamp: ts_triple.timestamp, + }; + if !union_streams.contains(&re_encoded_ts_triple) { + union_streams.push(re_encoded_ts_triple); + } + } + drop(other_dict); + + Self { + triples: union_triples, + streams: union_streams, + sliding_window: self.sliding_window.clone(), + dictionary: Arc::new(RwLock::new(merged_dictionary)), + prefixes: self.prefixes.clone(), + udfs: HashMap::new(), + index_manager: Some(self.index().clone_empty()), + rule_map: HashMap::new(), + cached_stats: None, + index_config: self.index_config.clone() + } + } + + pub fn par_join(&mut self, other: &SparqlDatabase, predicate: &str) -> Self { + let mut dict = self.dictionary.write().unwrap(); + let predicate_id = dict.encode(predicate); + drop(dict); + let other_map: BTreeMap<&u32, Vec<&Triple>> = other + .triples + .par_iter() + .filter(|other_triple| other_triple.predicate == predicate_id) + .flat_map(|other_triple| { + vec![ + (&other_triple.subject, other_triple), + (&other_triple.object, other_triple), + ] + }) + .fold( + || BTreeMap::new(), + |mut acc, (key, triple)| { + acc.entry(key).or_insert_with(Vec::new).push(triple); + acc + }, + ) + .reduce( + || BTreeMap::new(), + |mut acc, map| { + for (key, triples) in map { + 
acc.entry(key).or_insert_with(Vec::new).extend(triples); + } + acc + }, + ); + + let joined_triples: BTreeSet = self + .triples + .par_iter() + .filter(|triple| triple.predicate == predicate_id) + .fold( + || BTreeSet::new(), + |mut local_set, triple| { + if let Some(matching_triples) = other_map.get(&triple.object) { + for other_triple in matching_triples { + local_set.insert(Triple { + subject: triple.subject, + predicate: other_triple.predicate, + object: other_triple.object, + }); + } + } + local_set + }, + ) + .reduce( + || BTreeSet::new(), + |mut set1, set2| { + set1.extend(set2); + set1 + }, + ); + + Self { + triples: joined_triples, + streams: self.streams.clone(), + sliding_window: self.sliding_window.clone(), + dictionary: Arc::clone(&self.dictionary), + prefixes: self.prefixes.clone(), + udfs: HashMap::new(), + index_manager: Some(self.index().clone_empty()), + rule_map: HashMap::new(), + cached_stats: None, + index_config: self.index_config.clone(), + } + } + + pub fn perform_join<'a>( + &self, + subject_var: &'a str, + predicate: &'a str, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + ) -> Vec> { + let mut new_results = Vec::new(); - // Process group subject_var_bound. - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend object_var. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - local_results.push(extended_result); - } - } - } + for triple in triples { + let subject = dictionary.decode(triple.subject).unwrap(); + let pred = dictionary.decode(triple.predicate).unwrap(); + let object = dictionary.decode(triple.object).unwrap(); - // Process group object_var_bound. 
- { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend subject_var. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - local_results.push(extended_result); - } - } - } + if pred == predicate { + for result in &final_results { + let mut extended_result = result.clone(); + let mut valid_extension = true; - // Process group neither_var_bound. - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend subject_var. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend object_var. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - local_results.push(extended_result); - } - } + // Check and extend the result with the subject + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + valid_extension = false; } + } else { + extended_result.insert(subject_var, subject.to_string()); + } - // Push local results to the shared results vector. 
- let mut global_results = results.lock().unwrap(); - global_results.extend(local_results); - }); - - results.into_inner().unwrap() - } - - pub fn perform_join_sequential<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings. - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } + // Check and extend the result with the object + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + valid_extension = false; } - } - - let mut results = Vec::new(); - - // Process triples sequentially. - for triple in triples { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // Check if the predicate matches. 
- if pred.as_bytes() != predicate_bytes { - continue; - } + } else { + extended_result.insert(object_var, object.to_string()); + } - // Check the literal filter if provided. - if let Some(filter_bytes) = literal_filter_bytes { - if object.as_bytes() != filter_bytes { - continue; - } - } - - // Process group where both variables are already bound. - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - results.push(result.clone()); - } - } - } - - // Process group where only subject_var is bound. - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the object_var binding. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } - } - } - - // Process group where only object_var is bound. - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - results.push(extended_result); - } - } - } - - // Process group where neither variable is bound. - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend the object_var binding. 
- if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } - } + if valid_extension { + new_results.push(extended_result); + } } - - results + } } - pub fn perform_join_sequential_simd<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings. - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - let mut results = Vec::new(); - - // Process triples sequentially. 
- for triple in triples { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // Use SIMD-based comparison for the predicate. - if !simd_bytes_eq(pred.as_bytes(), predicate_bytes) { - continue; - } - - // Use SIMD-based comparison for the literal filter if provided. - if let Some(filter_bytes) = literal_filter_bytes { - if !simd_bytes_eq(object.as_bytes(), filter_bytes) { - continue; - } - } - - // Process group where both variables are already bound. - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - results.push(result.clone()); - } - } - } + new_results + } - // Process group where only subject_var is bound. - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the object_var binding. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } - } - } - - // Process group where only object_var is bound. - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. 
- } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - results.push(extended_result); - } - } - } + pub fn perform_join_par_simd_with_strict_filter_1<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &Arc>, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let dictionary = dictionary.read().unwrap(); + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + // Pre-allocate output vector + let results = Mutex::new(Vec::new()); + + // Using Rayon for parallel processing + triples.par_chunks(256).for_each(|chunk| { + let mut local_results = Vec::new(); + + for triple in chunk { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // SIMD predicate comparison + if pred.as_bytes() != predicate_bytes { + 
continue; + } - // Process group where neither variable is bound. - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend the object_var binding. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } + // SIMD literal filter comparison + if let Some(filter_bytes) = literal_filter_bytes { + if object.as_bytes() != filter_bytes { + continue; } - } + } - results - } - - pub fn perform_join_par_simd_with_strict_filter_3<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - // Early return for empty joins - if final_results.is_empty() { - return Vec::new(); - } - - // Pre-fetch predicate and filter bytes to avoid string comparisons - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Preallocate with capacity estimation to avoid rehashing - let estimated_capacity = (final_results.len() / 4).max(HASHMAP_INITIAL_CAPACITY); - - // Use with_capacity to preallocate hashmap space - let mut both_vars_bound: HashMap<(String, String), Vec> = - HashMap::with_capacity(estimated_capacity); - let mut subject_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut object_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); - - // Pre-compute and classify 
bindings - this is serial but much faster than doing it in parallel - for (idx, result) in final_results.iter().enumerate() { - let subject_binding = result.get(subject_var); - let object_binding = result.get(object_var); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_insert_with(|| Vec::with_capacity(4)) - .push(idx); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, None) => { - neither_var_bound.push(idx); - } - } - } - - // Immutable shared references for threading - let final_results_arc = Arc::new(final_results); - let both_vars_bound_arc = Arc::new(both_vars_bound); - let subject_var_bound_arc = Arc::new(subject_var_bound); - let object_var_bound_arc = Arc::new(object_var_bound); - let neither_var_bound_arc = Arc::new(neither_var_bound); - - // Calculate optimal chunk size based on available processors and dataset size - let chunk_size = (triples.len() / rayon::current_num_threads()).max(MIN_CHUNK_SIZE); - - // Process triples in chunks for better cache locality and load balancing - let results = triples - .par_chunks(chunk_size) - .flat_map(|triple_chunk| { - // Preallocate result vector for this chunk based on estimated hit rate - let mut local_results = Vec::with_capacity(triple_chunk.len() / 4); - - // Process each triple in the chunk - for triple in triple_chunk { - // Step 1: Quick predicate check first (early filter) - let pred_opt = dictionary.decode(triple.predicate); - if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { - continue; - } - - // Step 2: Filter check if needed - if let Some(filter_bytes) = &literal_filter_bytes { - let obj_opt = dictionary.decode(triple.object); - 
if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { - continue; - } - - // Decode subject only if predicate and object pass filters - if let Some(subj) = dictionary.decode(triple.subject) { - process_join( - &subj, - obj_opt.unwrap(), - subject_var, - object_var, - &both_vars_bound_arc, - &subject_var_bound_arc, - &object_var_bound_arc, - &neither_var_bound_arc, - &final_results_arc, - &mut local_results, - ); - } - } else { - // No filter - decode both subject and object - let subj_opt = dictionary.decode(triple.subject); - let obj_opt = dictionary.decode(triple.object); - - if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { - process_join( - &subj, - &obj, - subject_var, - object_var, - &both_vars_bound_arc, - &subject_var_bound_arc, - &object_var_bound_arc, - &neither_var_bound_arc, - &final_results_arc, - &mut local_results, - ); - } - } - } - - local_results - }) - .collect(); - - results - } - - pub fn perform_join_par_simd_with_strict_filter_4<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - // Early return for empty joins - if final_results.is_empty() { - return Vec::new(); - } - - // Pre-fetch predicate and filter bytes to avoid string comparisons - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - let estimated_capacity = (final_results.len() / 3).max(HASHMAP_INITIAL_CAPACITY1); - - let mut both_vars_bound: HashMap<(String, String), Vec> = - HashMap::with_capacity(estimated_capacity / 2); // This tends to be smaller - let mut subject_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut object_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); - - // Pre-compute and classify bindings - this 
is serial but much faster than doing it in parallel - for (idx, result) in final_results.iter().enumerate() { - let subject_binding = result.get(subject_var); - let object_binding = result.get(object_var); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_insert_with(|| Vec::with_capacity(4)) - .push(idx); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, None) => { - neither_var_bound.push(idx); + // Process group both_vars_bound + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + let extended_result = result.clone(); + local_results.push(extended_result); + } + } + } + + // Process group subject_var_bound + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); } + local_results.push(extended_result); + } + } + } + + // Process group object_var_bound + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group 
neither_var_bound + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); } - } - - // Immutable shared references for threading - let final_results_arc = Arc::new(final_results); - let both_vars_bound_arc = Arc::new(both_vars_bound); - let subject_var_bound_arc = Arc::new(subject_var_bound); - let object_var_bound_arc = Arc::new(object_var_bound); - let neither_var_bound_arc = Arc::new(neither_var_bound); - - let chunk_size = ((triples.len() / rayon::current_num_threads()) * 3 / 2).max(MIN_CHUNK_SIZE1); - - let results = triples - .par_chunks(chunk_size) - .fold( - || Vec::with_capacity(chunk_size / 4), // Local vector capacity based on chunk size - |mut local_results, triple_chunk| { - // Create a local result buffer - process_triple_chunk( - triple_chunk, - predicate_bytes, - &literal_filter_bytes, - subject_var, - object_var, - &both_vars_bound_arc, - &subject_var_bound_arc, - &object_var_bound_arc, - &neither_var_bound_arc, - &final_results_arc, - &mut local_results, - dictionary, - ); - - local_results - }, - ) - .reduce( - || Vec::new(), - |mut acc, mut chunk| { - if acc.is_empty() { - return chunk; - } - if chunk.is_empty() { - return acc; - } - - // Pre-allocate to avoid reallocation during append - if acc.capacity() < acc.len() + chunk.len() { - acc.reserve(chunk.len()); - } - acc.append(&mut chunk); - acc - }, - ); - - results - } - - pub fn istream(&self, last_timestamp: u64) -> Vec { - let mut new_triples = vec![]; - for ts_triple in &self.streams { - if ts_triple.timestamp > last_timestamp { - new_triples.push(ts_triple.triple.clone()); - } - } - new_triples - } - - pub fn dstream(&self, last_timestamp: u64, current_timestamp: u64) -> Vec { - let mut old_triples = BTreeSet::new(); - 
let mut current_triples = BTreeSet::new(); - - for ts_triple in &self.streams { - if ts_triple.timestamp <= last_timestamp { - old_triples.insert(ts_triple.triple.clone()); - } - if ts_triple.timestamp <= current_timestamp { - current_triples.insert(ts_triple.triple.clone()); + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); } + local_results.push(extended_result); + } } + } - old_triples.difference(¤t_triples).cloned().collect() - } - - pub fn rstream(&self, start: u64, end: u64) -> Vec { - let mut current_triples = BTreeSet::new(); - - for ts_triple in &self.streams { - if ts_triple.timestamp >= start && ts_triple.timestamp <= end { - current_triples.insert(ts_triple.triple.clone()); - } - } + // Push local results to the shared results vector + let mut global_results = results.lock().unwrap(); + global_results.extend(local_results); + }); - current_triples.into_iter().collect() - } + results.into_inner().unwrap() + } - pub fn set_sliding_window(&mut self, width: u64, slide: u64) { - self.sliding_window = Some(SlidingWindow::new(width, slide)); + pub fn perform_join_par_simd_with_strict_filter_2<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); } - pub fn evaluate_sliding_window(&mut self) -> Vec { - if let Some(window) = &self.sliding_window { - let current_time = current_timestamp(); - let start_time = if current_time > window.width { - current_time - window.width - } else { - 0 - }; + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - let result = self.rstream(start_time, current_time); + // Partition 
final_results into groups based on variable bindings. + let mut both_vars_bound: HashMap<(String, String), Vec>> = HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); - // Update last evaluated time - self.sliding_window.as_mut().unwrap().last_evaluated = current_time; + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); - result - } else { - Vec::new() + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); } - } - - pub fn window_close_policy(&mut self) -> Vec { - let mut result = vec![]; - if let Some(window) = &self.sliding_window { - let current_time = current_timestamp(); - if current_time >= window.last_evaluated + window.slide { - result = self.evaluate_sliding_window(); - } + (Some(subj_val), None) => { + subject_var_bound.entry(subj_val.clone()).or_default().push(result); } - result - } - - pub fn content_change_policy(&mut self) -> Vec { - let mut _result = vec![]; - let initial_state: BTreeSet<_> = self.triples.clone(); - if let Some(_window) = &self.sliding_window { - _result = self.evaluate_sliding_window(); - let current_state: BTreeSet<_> = self.triples.clone(); - if initial_state != current_state { - return _result; - } + (None, Some(obj_val)) => { + object_var_bound.entry(obj_val.clone()).or_default().push(result); } - vec![] - } - - pub fn non_empty_content_policy(&mut self) -> Vec { - let result = self.evaluate_sliding_window(); - if !result.is_empty() { - return result; + (None, None) => { + neither_var_bound.push(result); } - vec![] + } } - pub fn periodic_policy(&mut self, interval: std::time::Duration) -> Vec { - let mut result = vec![]; - if let Some(window) = &self.sliding_window { - let current_time = 
current_timestamp(); - if current_time >= window.last_evaluated + interval.as_secs() { - result = self.evaluate_sliding_window(); + // Pre-allocate output vector. + let results = Mutex::new(Vec::new()); + + // Using Rayon for parallel processing. + triples.par_chunks(256).for_each(|chunk| { + let mut local_results = Vec::new(); + + for triple in chunk { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // SIMD predicate comparison using simd_eq. + if !unsafe { simd_eq(pred.as_bytes(), predicate_bytes) } { + continue; + } + + // SIMD literal filter comparison. + if let Some(filter_bytes) = literal_filter_bytes { + if !unsafe { simd_eq(object.as_bytes(), filter_bytes) } { + continue; + } + } + + // Process group both_vars_bound. + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + local_results.push(result.clone()); + } + } + } + + // Process group subject_var_bound. + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend object_var. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group object_var_bound. + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend subject_var. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. 
+ } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group neither_var_bound. + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend subject_var. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); } + // Extend object_var. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + local_results.push(extended_result); + } } - result - } + } - pub fn auto_policy_evaluation(&mut self) -> Vec { - let current_time = current_timestamp(); - let mut result = vec![]; + // Push local results to the shared results vector. + let mut global_results = results.lock().unwrap(); + global_results.extend(local_results); + }); - if let Some(window) = &self.sliding_window { - if current_time >= window.last_evaluated + window.slide { - println!("Window Close Policy"); - result.extend(self.evaluate_sliding_window()); - } + results.into_inner().unwrap() + } + + pub fn perform_join_sequential<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings. 
+ let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + let mut results = Vec::new(); + + // Process triples sequentially. + for triple in triples { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // Check if the predicate matches. + if pred.as_bytes() != predicate_bytes { + continue; + } + + // Check the literal filter if provided. + if let Some(filter_bytes) = literal_filter_bytes { + if object.as_bytes() != filter_bytes { + continue; + } } - let initial_state: BTreeSet<_> = self.triples.clone(); - if let Some(_window) = &self.sliding_window { - let current_state: BTreeSet<_> = self.triples.clone(); - if initial_state != current_state { - println!("Content Change Policy"); - result.extend(self.evaluate_sliding_window()); + // Process group where both variables are already bound. 
+ { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + results.push(result.clone()); } + } } - let non_empty_result = self.evaluate_sliding_window(); - if !non_empty_result.is_empty() { - println!("Non-empty Content Policy"); - result.extend(non_empty_result); + // Process group where only subject_var is bound. + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } } - let interval = std::time::Duration::new(5, 0); - if let Some(window) = &self.sliding_window { - if current_time >= window.last_evaluated + interval.as_secs() { - println!("Periodic Policy"); - result.extend(self.evaluate_sliding_window()); + // Process group where only object_var is bound. + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + results.push(extended_result); } + } } - result + // Process group where neither variable is bound. + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. 
+ } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } } - pub fn handle_query(&mut self, query: &str) -> String { - // Assume the query string is in a basic format like "subject predicate object" - let parts: Vec<&str> = query.split_whitespace().collect(); + results + } - if parts.len() != 3 { - return "Invalid query format. Expected 'subject predicate object'.".to_string(); + pub fn perform_join_sequential_simd<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings. 
+ let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + let mut results = Vec::new(); + + // Process triples sequentially. + for triple in triples { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // Use SIMD-based comparison for the predicate. + if !simd_bytes_eq(pred.as_bytes(), predicate_bytes) { + continue; + } + + // Use SIMD-based comparison for the literal filter if provided. 
+ if let Some(filter_bytes) = literal_filter_bytes { + if !simd_bytes_eq(object.as_bytes(), filter_bytes) { + continue; + } } - let subject = parts[0]; - let predicate = parts[1]; - let object = parts[2]; - - let mut dict = self.dictionary.write().unwrap(); - let subject_id = dict.encode(subject); - let predicate_id = dict.encode(predicate); - let object_id = dict.encode(object); - - let mut result = String::new(); - for triple in &self.triples { - if triple.subject == subject_id - && triple.predicate == predicate_id - && triple.object == object_id - { - result.push_str(&format!( - "Subject: {}, Predicate: {}, Object: {}\n", - dict.decode(triple.subject).unwrap(), - dict.decode(triple.predicate).unwrap(), - dict.decode(triple.object).unwrap() - )); + // Process group where both variables are already bound. + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + results.push(result.clone()); } + } } - drop(dict); - if result.is_empty() { - result = "No matching triples found.".to_string(); + // Process group where only subject_var is bound. + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. 
+ } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } } - result - } - - pub fn handle_update(&mut self, update: &str) -> String { - // Parse the SPARQL update and apply changes to the database - if update.starts_with("INSERT") { - // Extract the part between curly braces - if let Some(start) = update.find('{') { - if let Some(end) = update.find('}') { - let triple_str = &update[start + 1..end].trim(); - let parts: Vec<&str> = triple_str.split_whitespace().collect(); - - if parts.len() == 3 { - let subject = parts[0].to_string(); - let predicate = parts[1].to_string(); - let object = parts[2].to_string(); - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - self.triples.insert(triple); - return "Update Successful".to_string(); - } - } - } - } else if update.starts_with("DELETE") { - // Extract the part between curly braces - if let Some(start) = update.find('{') { - if let Some(end) = update.find('}') { - let triple_str = &update[start + 1..end].trim(); - let parts: Vec<&str> = triple_str.split_whitespace().collect(); - - if parts.len() == 3 { - let subject = parts[0].to_string(); - let predicate = parts[1].to_string(); - let object = parts[2].to_string(); - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - self.triples.remove(&triple); - return "Update Successful".to_string(); - } - } + // Process group where only object_var is bound. + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the subject_var binding. 
+ if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + results.push(extended_result); } + } } - "Update Failed".to_string() - } - - pub fn handle_http_request(&mut self, request: &str) -> String { - let mut headers = [httparse::EMPTY_HEADER; 16]; - let mut req = httparse::Request::new(&mut headers); - req.parse(request.as_bytes()).unwrap(); - match req.method.unwrap() { - "GET" => { - let url = Url::parse(&("http://localhost".to_owned() + req.path.unwrap())).unwrap(); - let query_pairs: HashMap<_, _> = url.query_pairs().into_owned().collect(); - if let Some(query) = query_pairs.get("query") { - return self.handle_query(query); - } + // Process group where neither variable is bound. + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. 
} - "POST" => { - let content_type = req - .headers - .iter() - .find(|header| header.name.eq_ignore_ascii_case("Content-Type")) - .map(|header| header.value); - - if let Some(content_type) = content_type { - if content_type == b"application/sparql-query" { - // Direct POST query - if let Some(body) = request.split("\r\n\r\n").nth(1) { - return self.handle_query(body); - } - } else if content_type == b"application/x-www-form-urlencoded" { - // URL-encoded POST query or update - if let Some(body) = request.split("\r\n\r\n").nth(1) { - let body_decoded = - percent_decode(body.as_bytes()).decode_utf8().unwrap(); - let params: HashMap<_, _> = body_decoded - .split('&') - .map(|pair| { - let mut split = pair.split('='); - ( - split.next().unwrap().to_string(), - split.next().unwrap_or("").to_string(), - ) - }) - .collect(); - - if let Some(query) = params.get("query") { - return self.handle_query(query); - } else if let Some(update) = params.get("update") { - return self.handle_update(update); - } - } - } else if content_type == b"application/sparql-update" { - // Direct POST update - if let Some(body) = request.split("\r\n\r\n").nth(1) { - return self.handle_update(body); - } - } - } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. 
} - _ => {} - } - - "Bad Request".to_string() - } - - pub fn debug_print_triples(&self) { - let dict = self.dictionary.read().unwrap(); - for triple in &self.triples { - println!( - "Stored Triple -> Subject: {}, Predicate: {}, Object: {}", - dict.decode(triple.subject).unwrap(), - dict.decode(triple.predicate).unwrap(), - dict.decode(triple.object).unwrap() - ); + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); } + } } - #[cfg(feature = "cuda")] - pub fn perform_hash_join_cuda_wrapper<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - // Prepare data for CUDA - let subjects: Vec = triples.iter().map(|t| t.subject).collect(); - let predicates: Vec = triples.iter().map(|t| t.predicate).collect(); - let objects: Vec = triples.iter().map(|t| t.object).collect(); + results + } - let predicate_filter = dictionary.clone().encode(&predicate); + pub fn perform_join_par_simd_with_strict_filter_3<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + // Early return for empty joins + if final_results.is_empty() { + return Vec::new(); + } + + // Pre-fetch predicate and filter bytes to avoid string comparisons + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Preallocate with capacity estimation to avoid rehashing + let estimated_capacity = (final_results.len() / 4).max(HASHMAP_INITIAL_CAPACITY); + + // Use with_capacity to preallocate hashmap space + let mut both_vars_bound: HashMap<(String, String), Vec> = + HashMap::with_capacity(estimated_capacity); + let mut subject_var_bound: HashMap> = + 
HashMap::with_capacity(estimated_capacity); + let mut object_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); + + // Pre-compute and classify bindings - this is serial but much faster than doing it in parallel + for (idx, result) in final_results.iter().enumerate() { + let subject_binding = result.get(subject_var); + let object_binding = result.get(object_var); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_insert_with(|| Vec::with_capacity(4)) + .push(idx); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, None) => { + neither_var_bound.push(idx); + } + } + } + + // Immutable shared references for threading + let final_results_arc = Arc::new(final_results); + let both_vars_bound_arc = Arc::new(both_vars_bound); + let subject_var_bound_arc = Arc::new(subject_var_bound); + let object_var_bound_arc = Arc::new(object_var_bound); + let neither_var_bound_arc = Arc::new(neither_var_bound); + + // Calculate optimal chunk size based on available processors and dataset size + let chunk_size = (triples.len() / rayon::current_num_threads()).max(MIN_CHUNK_SIZE); + + // Process triples in chunks for better cache locality and load balancing + let results = triples + .par_chunks(chunk_size) + .flat_map(|triple_chunk| { + // Preallocate result vector for this chunk based on estimated hit rate + let mut local_results = Vec::with_capacity(triple_chunk.len() / 4); + + // Process each triple in the chunk + for triple in triple_chunk { + // Step 1: Quick predicate check first (early filter) + let pred_opt = dictionary.decode(triple.predicate); + if 
pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { + continue; + } - let literal_filter_value = literal_filter - .as_ref() - .map(|lit| dictionary.clone().encode(lit)) - .unwrap_or(0); + // Step 2: Filter check if needed + if let Some(filter_bytes) = &literal_filter_bytes { + let obj_opt = dictionary.decode(triple.object); + if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { + continue; + } - let literal_filter_option = if literal_filter.is_some() { - Some(literal_filter_value) - } else { - None - }; + // Decode subject only if predicate and object pass filters + if let Some(subj) = dictionary.decode(triple.subject) { + process_join( + &subj, + obj_opt.unwrap(), + subject_var, + object_var, + &both_vars_bound_arc, + &subject_var_bound_arc, + &object_var_bound_arc, + &neither_var_bound_arc, + &final_results_arc, + &mut local_results, + ); + } + } else { + // No filter - decode both subject and object + let subj_opt = dictionary.decode(triple.subject); + let obj_opt = dictionary.decode(triple.object); - // Call CUDA function - let matching_indices = hash_join_cuda( - &subjects, - &predicates, - &objects, - predicate_filter, - literal_filter_option, - ); + if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { + process_join( + &subj, + &obj, + subject_var, + object_var, + &both_vars_bound_arc, + &subject_var_bound_arc, + &object_var_bound_arc, + &neither_var_bound_arc, + &final_results_arc, + &mut local_results, + ); + } + } + } + + local_results + }) + .collect(); + + results + } + + pub fn perform_join_par_simd_with_strict_filter_4<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + // Early return for empty joins + if final_results.is_empty() { + return Vec::new(); + } + + // Pre-fetch predicate and filter bytes to avoid string comparisons + let predicate_bytes = 
predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + let estimated_capacity = (final_results.len() / 3).max(HASHMAP_INITIAL_CAPACITY1); + + let mut both_vars_bound: HashMap<(String, String), Vec> = + HashMap::with_capacity(estimated_capacity / 2); // This tends to be smaller + let mut subject_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut object_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); + + // Pre-compute and classify bindings - this is serial but much faster than doing it in parallel + for (idx, result) in final_results.iter().enumerate() { + let subject_binding = result.get(subject_var); + let object_binding = result.get(object_var); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_insert_with(|| Vec::with_capacity(4)) + .push(idx); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, None) => { + neither_var_bound.push(idx); + } + } + } + + // Immutable shared references for threading + let final_results_arc = Arc::new(final_results); + let both_vars_bound_arc = Arc::new(both_vars_bound); + let subject_var_bound_arc = Arc::new(subject_var_bound); + let object_var_bound_arc = Arc::new(object_var_bound); + let neither_var_bound_arc = Arc::new(neither_var_bound); + + let chunk_size = ((triples.len() / rayon::current_num_threads()) * 3 / 2).max(MIN_CHUNK_SIZE1); + + let results = triples + .par_chunks(chunk_size) + .fold( + || Vec::with_capacity(chunk_size / 4), // Local vector capacity based on chunk size + |mut local_results, triple_chunk| { + // 
Create a local result buffer + process_triple_chunk( + triple_chunk, + predicate_bytes, + &literal_filter_bytes, + subject_var, + object_var, + &both_vars_bound_arc, + &subject_var_bound_arc, + &object_var_bound_arc, + &neither_var_bound_arc, + &final_results_arc, + &mut local_results, + dictionary, + ); + + local_results + }, + ) + .reduce( + || Vec::new(), + |mut acc, mut chunk| { + if acc.is_empty() { + return chunk; + } + if chunk.is_empty() { + return acc; + } + + // Pre-allocate to avoid reallocation during append + if acc.capacity() < acc.len() + chunk.len() { + acc.reserve(chunk.len()); + } + acc.append(&mut chunk); + acc + }, + ); + + results + } + + pub fn istream(&self, last_timestamp: u64) -> Vec { + let mut new_triples = vec![]; + for ts_triple in &self.streams { + if ts_triple.timestamp > last_timestamp { + new_triples.push(ts_triple.triple.clone()); + } + } + new_triples + } + + pub fn dstream(&self, last_timestamp: u64, current_timestamp: u64) -> Vec { + let mut old_triples = BTreeSet::new(); + let mut current_triples = BTreeSet::new(); + + for ts_triple in &self.streams { + if ts_triple.timestamp <= last_timestamp { + old_triples.insert(ts_triple.triple.clone()); + } + if ts_triple.timestamp <= current_timestamp { + current_triples.insert(ts_triple.triple.clone()); + } + } + + old_triples.difference(¤t_triples).cloned().collect() + } + + pub fn rstream(&self, start: u64, end: u64) -> Vec { + let mut current_triples = BTreeSet::new(); + + for ts_triple in &self.streams { + if ts_triple.timestamp >= start && ts_triple.timestamp <= end { + current_triples.insert(ts_triple.triple.clone()); + } + } + + current_triples.into_iter().collect() + } + + pub fn set_sliding_window(&mut self, width: u64, slide: u64) { + self.sliding_window = Some(SlidingWindow::new(width, slide)); + } + + pub fn evaluate_sliding_window(&mut self) -> Vec { + if let Some(window) = &self.sliding_window { + let current_time = current_timestamp(); + let start_time = if current_time > 
window.width { + current_time - window.width + } else { + 0 + }; + + let result = self.rstream(start_time, current_time); + + // Update last evaluated time + self.sliding_window.as_mut().unwrap().last_evaluated = current_time; + + result + } else { + Vec::new() + } + } + + pub fn window_close_policy(&mut self) -> Vec { + let mut result = vec![]; + if let Some(window) = &self.sliding_window { + let current_time = current_timestamp(); + if current_time >= window.last_evaluated + window.slide { + result = self.evaluate_sliding_window(); + } + } + result + } + + pub fn content_change_policy(&mut self) -> Vec { + let mut _result = vec![]; + let initial_state: BTreeSet<_> = self.triples.clone(); + if let Some(_window) = &self.sliding_window { + _result = self.evaluate_sliding_window(); + let current_state: BTreeSet<_> = self.triples.clone(); + if initial_state != current_state { + return _result; + } + } + vec![] + } + + pub fn non_empty_content_policy(&mut self) -> Vec { + let result = self.evaluate_sliding_window(); + if !result.is_empty() { + return result; + } + vec![] + } + + pub fn periodic_policy(&mut self, interval: std::time::Duration) -> Vec { + let mut result = vec![]; + if let Some(window) = &self.sliding_window { + let current_time = current_timestamp(); + if current_time >= window.last_evaluated + interval.as_secs() { + result = self.evaluate_sliding_window(); + } + } + result + } + + pub fn auto_policy_evaluation(&mut self) -> Vec { + let current_time = current_timestamp(); + let mut result = vec![]; + + if let Some(window) = &self.sliding_window { + if current_time >= window.last_evaluated + window.slide { + println!("Window Close Policy"); + result.extend(self.evaluate_sliding_window()); + } + } + + let initial_state: BTreeSet<_> = self.triples.clone(); + if let Some(_window) = &self.sliding_window { + let current_state: BTreeSet<_> = self.triples.clone(); + if initial_state != current_state { + println!("Content Change Policy"); + 
result.extend(self.evaluate_sliding_window()); + } + } + + let non_empty_result = self.evaluate_sliding_window(); + if !non_empty_result.is_empty() { + println!("Non-empty Content Policy"); + result.extend(non_empty_result); + } + + let interval = std::time::Duration::new(5, 0); + if let Some(window) = &self.sliding_window { + if current_time >= window.last_evaluated + interval.as_secs() { + println!("Periodic Policy"); + result.extend(self.evaluate_sliding_window()); + } + } + + result + } + + pub fn handle_query(&mut self, query: &str) -> String { + // Assume the query string is in a basic format like "subject predicate object" + let parts: Vec<&str> = query.split_whitespace().collect(); + + if parts.len() != 3 { + return "Invalid query format. Expected 'subject predicate object'.".to_string(); + } + + let subject = parts[0]; + let predicate = parts[1]; + let object = parts[2]; + + let mut dict = self.dictionary.write().unwrap(); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); + + let mut result = String::new(); + for triple in &self.triples { + if triple.subject == subject_id + && triple.predicate == predicate_id + && triple.object == object_id + { + result.push_str(&format!( + "Subject: {}, Predicate: {}, Object: {}\n", + dict.decode(triple.subject).unwrap(), + dict.decode(triple.predicate).unwrap(), + dict.decode(triple.object).unwrap() + )); + } + } + drop(dict); + + if result.is_empty() { + result = "No matching triples found.".to_string(); + } + + result + } + + pub fn handle_update(&mut self, update: &str) -> String { + // Parse the SPARQL update and apply changes to the database + if update.starts_with("INSERT") { + // Extract the part between curly braces + if let Some(start) = update.find('{') { + if let Some(end) = update.find('}') { + let triple_str = &update[start + 1..end].trim(); + let parts: Vec<&str> = triple_str.split_whitespace().collect(); + + if parts.len() == 3 { + 
let subject = parts[0].to_string(); + let predicate = parts[1].to_string(); + let object = parts[2].to_string(); + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + self.triples.insert(triple); + return "Update Successful".to_string(); + } + } + } + } else if update.starts_with("DELETE") { + // Extract the part between curly braces + if let Some(start) = update.find('{') { + if let Some(end) = update.find('}') { + let triple_str = &update[start + 1..end].trim(); + let parts: Vec<&str> = triple_str.split_whitespace().collect(); + + if parts.len() == 3 { + let subject = parts[0].to_string(); + let predicate = parts[1].to_string(); + let object = parts[2].to_string(); + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + self.triples.remove(&triple); + return "Update Successful".to_string(); + } + } + } + } + "Update Failed".to_string() + } + + pub fn handle_http_request(&mut self, request: &str) -> String { + let mut headers = [httparse::EMPTY_HEADER; 16]; + let mut req = httparse::Request::new(&mut headers); + req.parse(request.as_bytes()).unwrap(); + + match req.method.unwrap() { + "GET" => { + let url = Url::parse(&("http://localhost".to_owned() + req.path.unwrap())).unwrap(); + let query_pairs: HashMap<_, _> = url.query_pairs().into_owned().collect(); + if let Some(query) = query_pairs.get("query") { + return self.handle_query(query); + } + } + "POST" => { + let content_type = req + .headers + .iter() + .find(|header| header.name.eq_ignore_ascii_case("Content-Type")) + .map(|header| header.value); + + if let Some(content_type) = content_type { + if content_type == b"application/sparql-query" { + // Direct POST query + if let Some(body) = request.split("\r\n\r\n").nth(1) 
{ + return self.handle_query(body); + } + } else if content_type == b"application/x-www-form-urlencoded" { + // URL-encoded POST query or update + if let Some(body) = request.split("\r\n\r\n").nth(1) { + let body_decoded = + percent_decode(body.as_bytes()).decode_utf8().unwrap(); + let params: HashMap<_, _> = body_decoded + .split('&') + .map(|pair| { + let mut split = pair.split('='); + ( + split.next().unwrap().to_string(), + split.next().unwrap_or("").to_string(), + ) + }) + .collect(); - // Prepare variable bindings - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } + if let Some(query) = params.get("query") { + return self.handle_query(query); + } else if let Some(update) = params.get("update") { + return self.handle_update(update); + } + } + } else if content_type == b"application/sparql-update" { + // Direct POST update + if let Some(body) = request.split("\r\n\r\n").nth(1) { + return self.handle_update(body); } + } } + } + _ => {} + } - // Reconstruct results - let mut results = Vec::new(); - - for idx in matching_indices { - let triple = &triples[idx as usize]; - - if let (Some(subject), Some(object)) = ( - dictionary.decode(triple.subject), - 
dictionary.decode(triple.object), - ) { - // Process group both_vars_bound - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - let extended_result = result.clone(); - results.push(extended_result); - } - } - } - - // Process group subject_var_bound - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } - } - } + "Bad Request".to_string() + } - // Process group object_var_bound - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - results.push(extended_result); - } - } - } + pub fn debug_print_triples(&self) { + let dict = self.dictionary.read().unwrap(); + for triple in &self.triples { + println!( + "Stored Triple -> Subject: {}, Predicate: {}, Object: {}", + dict.decode(triple.subject).unwrap(), + dict.decode(triple.predicate).unwrap(), + dict.decode(triple.object).unwrap() + ); + } + } - // Process group neither_var_bound - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend object_var - 
if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } + #[cfg(feature = "cuda")] + pub fn perform_hash_join_cuda_wrapper<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + // Prepare data for CUDA + let subjects: Vec = triples.iter().map(|t| t.subject).collect(); + let predicates: Vec = triples.iter().map(|t| t.predicate).collect(); + let objects: Vec = triples.iter().map(|t| t.object).collect(); + + let predicate_filter = dictionary.clone().encode(&predicate); + + let literal_filter_value = literal_filter + .as_ref() + .map(|lit| dictionary.clone().encode(lit)) + .unwrap_or(0); + + let literal_filter_option = if literal_filter.is_some() { + Some(literal_filter_value) + } else { + None + }; + + // Call CUDA function + let matching_indices = hash_join_cuda( + &subjects, + &predicates, + &objects, + predicate_filter, + literal_filter_option, + ); + + // Prepare variable bindings + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, 
Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + // Reconstruct results + let mut results = Vec::new(); + + for idx in matching_indices { + let triple = &triples[idx as usize]; + + if let (Some(subject), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.object), + ) { + // Process group both_vars_bound + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + let extended_result = result.clone(); + results.push(extended_result); } + } } - results - } + // Process group subject_var_bound + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } - // Create user defined function - pub fn register_udf(&mut self, name: &str, f: F) - where - F: Fn(Vec<&str>) -> String + Send + Sync + 'static, - { - self.udfs.insert(name.to_string(), ClonableFn::new(f)); - } - - /// Rebuild all indexes from the current state of `self.triples`. 
- pub fn build_all_indexes(&mut self) { - // Clear existing indexes - self.index_manager.clear(); - - // Get all triples as a vector for parallel processing - let triples: Vec = self.triples.iter().cloned().collect(); - - self.index_manager.clear(); - self.index_manager.build_from_triples(&triples); - - // Optimize the final merged index - self.index_manager.optimize(); - } - - /// Triple to string - pub fn triple_to_string(&self, triple: &Triple, dict: &Dictionary) -> String { - let subject = dict.decode(triple.subject); - let predicate = dict.decode(triple.predicate); - let object = dict.decode(triple.object); - format!("{} {} {}", subject.unwrap(), predicate.unwrap(), object.unwrap()) - } - - pub fn decode_triple(&self, triple: &Triple) -> Option<(String, String, String)> { - let dict = self.dictionary.read().unwrap(); - let subject = dict.decode(triple.subject)?.to_string(); - let predicate = dict.decode(triple.predicate)?.to_string(); - let object = dict.decode(triple.object)?.to_string(); - drop(dict); - - Some((subject, predicate, object)) - } + // Process group object_var_bound + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group neither_var_bound + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) 
{ + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } + + results + } + + // Create user defined function + pub fn register_udf(&mut self, name: &str, f: F) +where + F: Fn(Vec<&str>) -> String + Send + Sync + 'static, + { + self.udfs.insert(name.to_string(), ClonableFn::new(f)); + } + + /// Triple to string + pub fn triple_to_string(&self, triple: &Triple, dict: &Dictionary) -> String { + let subject = dict.decode(triple.subject); + let predicate = dict.decode(triple.predicate); + let object = dict.decode(triple.object); + format!("{} {} {}", subject.unwrap(), predicate.unwrap(), object.unwrap()) + } + + pub fn decode_triple(&self, triple: &Triple) -> Option<(String, String, String)> { + let dict = self.dictionary.read().unwrap(); + let subject = dict.decode(triple.subject)?.to_string(); + let predicate = dict.decode(triple.predicate)?.to_string(); + let object = dict.decode(triple.object)?.to_string(); + drop(dict); + + Some((subject, predicate, object)) + } } #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), target_feature(enable = "sse2"))] #[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))] pub unsafe fn simd_eq(a: &[u8], b: &[u8]) -> bool { - if a.len() != b.len() { + if a.len() != b.len() { + return false; + } + + // SSE2 implementation for x86/x86_64 + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + let len = a.len(); + let chunks = len / 16; + let mut i = 0; + while i < chunks * 16 { + let pa = a.as_ptr().add(i) as *const __m128i; + let pb = b.as_ptr().add(i) as *const __m128i; + let va = _mm_loadu_si128(pa); + let vb = _mm_loadu_si128(pb); + let cmp = _mm_cmpeq_epi8(va, vb); + let mask = _mm_movemask_epi8(cmp); + if mask != 0xFFFF { return false; + } + i += 16; } - - // SSE2 implementation for x86/x86_64 - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { 
- let len = a.len(); - let chunks = len / 16; - let mut i = 0; - while i < chunks * 16 { - let pa = a.as_ptr().add(i) as *const __m128i; - let pb = b.as_ptr().add(i) as *const __m128i; - let va = _mm_loadu_si128(pa); - let vb = _mm_loadu_si128(pb); - let cmp = _mm_cmpeq_epi8(va, vb); - let mask = _mm_movemask_epi8(cmp); - if mask != 0xFFFF { - return false; - } - i += 16; - } - // Compare any remaining bytes - for j in (chunks * 16)..len { - if a[j] != b[j] { - return false; - } - } - return true; + // Compare any remaining bytes + for j in (chunks * 16)..len { + if a[j] != b[j] { + return false; + } + } + return true; + } + + // NEON implementation for aarch64 + #[cfg(target_arch = "aarch64")] + { + let len = a.len(); + let chunks = len / 16; + let mut i = 0; + while i < chunks * 16 { + let pa = a.as_ptr().add(i); + let pb = b.as_ptr().add(i); + let va = vld1q_u8(pa); + let vb = vld1q_u8(pb); + let cmp = vceqq_u8(va, vb); + let cmp_u64 = vreinterpretq_u64_u8(cmp); + let low = vgetq_lane_u64(cmp_u64, 0); + let high = vgetq_lane_u64(cmp_u64, 1); + if low != u64::MAX || high != u64::MAX { + return false; + } + i += 16; } - - // NEON implementation for aarch64 - #[cfg(target_arch = "aarch64")] - { - let len = a.len(); - let chunks = len / 16; - let mut i = 0; - while i < chunks * 16 { - let pa = a.as_ptr().add(i); - let pb = b.as_ptr().add(i); - let va = vld1q_u8(pa); - let vb = vld1q_u8(pb); - let cmp = vceqq_u8(va, vb); - let cmp_u64 = vreinterpretq_u64_u8(cmp); - let low = vgetq_lane_u64(cmp_u64, 0); - let high = vgetq_lane_u64(cmp_u64, 1); - if low != u64::MAX || high != u64::MAX { - return false; - } - i += 16; - } - // Compare any remaining bytes - for j in (chunks * 16)..len { - if a[j] != b[j] { - return false; - } - } - return true; + // Compare any remaining bytes + for j in (chunks * 16)..len { + if a[j] != b[j] { + return false; + } } + return true; + } - // Fallback for other architectures - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", 
target_arch = "aarch64")))] - { - return a == b; - } + // Fallback for other architectures + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] + { + return a == b; + } } #[inline] fn simd_bytes_eq(a: &[u8], b: &[u8]) -> bool { - if a.len() != b.len() { + if a.len() != b.len() { + return false; + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + use std::arch::x86_64::*; + let mut i = 0; + let len = a.len(); + while i + 16 <= len { + let a_chunk = _mm_loadu_si128(a.as_ptr().add(i) as *const __m128i); + let b_chunk = _mm_loadu_si128(b.as_ptr().add(i) as *const __m128i); + let cmp = _mm_cmpeq_epi8(a_chunk, b_chunk); + // If all 16 bytes match, _mm_movemask_epi8 returns 0xFFFF. + if _mm_movemask_epi8(cmp) != 0xFFFF { return false; + } + i += 16; } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { - use std::arch::x86_64::*; - let mut i = 0; - let len = a.len(); - while i + 16 <= len { - let a_chunk = _mm_loadu_si128(a.as_ptr().add(i) as *const __m128i); - let b_chunk = _mm_loadu_si128(b.as_ptr().add(i) as *const __m128i); - let cmp = _mm_cmpeq_epi8(a_chunk, b_chunk); - // If all 16 bytes match, _mm_movemask_epi8 returns 0xFFFF. - if _mm_movemask_epi8(cmp) != 0xFFFF { - return false; - } - i += 16; - } - // Compare any remaining bytes. - for j in i..len { - if a[j] != b[j] { - return false; - } - } - true - } - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] - { - // Fallback on non-x86 architectures. - a == b - } + // Compare any remaining bytes. + for j in i..len { + if a[j] != b[j] { + return false; + } + } + true + } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + // Fallback on non-x86 architectures. 
+ a == b + } } #[inline(always)] fn process_join<'a>( - subject: &str, - object: &str, - subject_var: &'a str, - object_var: &'a str, - both_vars_bound: &Arc>>, - subject_var_bound: &Arc>>, - object_var_bound: &Arc>>, - neither_var_bound: &Arc>, - final_results_arc: &Arc>>, - local_results: &mut Vec>, + subject: &str, + object: &str, + subject_var: &'a str, + object_var: &'a str, + both_vars_bound: &Arc>>, + subject_var_bound: &Arc>>, + object_var_bound: &Arc>>, + neither_var_bound: &Arc>, + final_results_arc: &Arc>>, + local_results: &mut Vec>, ) { - // Check both_vars_bound - most restrictive case first - if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { - for &idx in result_indices { - local_results.push(final_results_arc[idx].clone()); - } - } - - // Process subject_var_bound - if let Some(result_indices) = subject_var_bound.get(subject) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - // Check for object consistency if it exists - if let Some(existing_object) = base_result.get(object_var) { - if existing_object == object { - local_results.push(base_result.clone()); - } - } else { - // Bind the object variable - let mut extended_result = base_result.clone(); - extended_result.insert(object_var, object.to_string()); - local_results.push(extended_result); - } - } - } - - // Process object_var_bound - if let Some(result_indices) = object_var_bound.get(object) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - // Check for subject consistency if it exists - if let Some(existing_subject) = base_result.get(subject_var) { - if existing_subject == subject { - local_results.push(base_result.clone()); - } - } else { - // Bind the subject variable - let mut extended_result = base_result.clone(); - extended_result.insert(subject_var, subject.to_string()); - local_results.push(extended_result); - } - } - } - - // Process neither_var_bound - least restrictive case last - 
for &idx in neither_var_bound.iter() { - let base_result = &final_results_arc[idx]; - - // Check both consistency constraints - let subject_consistent = base_result - .get(subject_var) - .map_or(true, |existing| existing == subject); - let object_consistent = base_result - .get(object_var) - .map_or(true, |existing| existing == object); - - if subject_consistent && object_consistent { - let mut extended_result = base_result.clone(); - - // Only insert if not already present - if !base_result.contains_key(subject_var) { - extended_result.insert(subject_var, subject.to_string()); - } - if !base_result.contains_key(object_var) { - extended_result.insert(object_var, object.to_string()); - } - - local_results.push(extended_result); - } - } + // Check both_vars_bound - most restrictive case first + if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { + for &idx in result_indices { + local_results.push(final_results_arc[idx].clone()); + } + } + + // Process subject_var_bound + if let Some(result_indices) = subject_var_bound.get(subject) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + // Check for object consistency if it exists + if let Some(existing_object) = base_result.get(object_var) { + if existing_object == object { + local_results.push(base_result.clone()); + } + } else { + // Bind the object variable + let mut extended_result = base_result.clone(); + extended_result.insert(object_var, object.to_string()); + local_results.push(extended_result); + } + } + } + + // Process object_var_bound + if let Some(result_indices) = object_var_bound.get(object) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + // Check for subject consistency if it exists + if let Some(existing_subject) = base_result.get(subject_var) { + if existing_subject == subject { + local_results.push(base_result.clone()); + } + } else { + // Bind the subject variable + let mut extended_result = 
base_result.clone(); + extended_result.insert(subject_var, subject.to_string()); + local_results.push(extended_result); + } + } + } + + // Process neither_var_bound - least restrictive case last + for &idx in neither_var_bound.iter() { + let base_result = &final_results_arc[idx]; + + // Check both consistency constraints + let subject_consistent = base_result + .get(subject_var) + .map_or(true, |existing| existing == subject); + let object_consistent = base_result + .get(object_var) + .map_or(true, |existing| existing == object); + + if subject_consistent && object_consistent { + let mut extended_result = base_result.clone(); + + // Only insert if not already present + if !base_result.contains_key(subject_var) { + extended_result.insert(subject_var, subject.to_string()); + } + if !base_result.contains_key(object_var) { + extended_result.insert(object_var, object.to_string()); + } + + local_results.push(extended_result); + } + } } #[inline(always)] fn process_triple_chunk<'a>( - triple_chunk: &[Triple], - predicate_bytes: &[u8], - literal_filter_bytes: &Option<&[u8]>, - subject_var: &'a str, - object_var: &'a str, - both_vars_bound: &Arc>>, - subject_var_bound: &Arc>>, - object_var_bound: &Arc>>, - neither_var_bound: &Arc>, - final_results_arc: &Arc>>, - local_results: &mut Vec>, - dictionary: &'a Dictionary, + triple_chunk: &[Triple], + predicate_bytes: &[u8], + literal_filter_bytes: &Option<&[u8]>, + subject_var: &'a str, + object_var: &'a str, + both_vars_bound: &Arc>>, + subject_var_bound: &Arc>>, + object_var_bound: &Arc>>, + neither_var_bound: &Arc>, + final_results_arc: &Arc>>, + local_results: &mut Vec>, + dictionary: &'a Dictionary, ) { - // Pre-filter triples to avoid unnecessary decoding - for triple in triple_chunk { - let pred_opt = dictionary.decode(triple.predicate); - if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { - continue; - } - - if let Some(filter_bytes) = literal_filter_bytes { - let obj_opt = 
dictionary.decode(triple.object); - if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { - continue; - } - - if let Some(subj) = dictionary.decode(triple.subject) { - process_join_efficiently( - &subj, - obj_opt.unwrap(), - subject_var, - object_var, - both_vars_bound, - subject_var_bound, - object_var_bound, - neither_var_bound, - final_results_arc, - local_results, - ); - } - } else { - let subj_opt = dictionary.decode(triple.subject); - let obj_opt = dictionary.decode(triple.object); - - if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { - process_join_efficiently( - &subj, - &obj, - subject_var, - object_var, - both_vars_bound, - subject_var_bound, - object_var_bound, - neither_var_bound, - final_results_arc, - local_results, - ); - } - } + // Pre-filter triples to avoid unnecessary decoding + for triple in triple_chunk { + let pred_opt = dictionary.decode(triple.predicate); + if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { + continue; + } + + if let Some(filter_bytes) = literal_filter_bytes { + let obj_opt = dictionary.decode(triple.object); + if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { + continue; + } + + if let Some(subj) = dictionary.decode(triple.subject) { + process_join_efficiently( + &subj, + obj_opt.unwrap(), + subject_var, + object_var, + both_vars_bound, + subject_var_bound, + object_var_bound, + neither_var_bound, + final_results_arc, + local_results, + ); + } + } else { + let subj_opt = dictionary.decode(triple.subject); + let obj_opt = dictionary.decode(triple.object); + + if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { + process_join_efficiently( + &subj, + &obj, + subject_var, + object_var, + both_vars_bound, + subject_var_bound, + object_var_bound, + neither_var_bound, + final_results_arc, + local_results, + ); + } } + } } #[inline(always)] fn process_join_efficiently<'a>( - subject: &str, - object: &str, - subject_var: &'a str, - 
object_var: &'a str, - both_vars_bound: &Arc>>, - subject_var_bound: &Arc>>, - object_var_bound: &Arc>>, - neither_var_bound: &Arc>, - final_results_arc: &Arc>>, - local_results: &mut Vec>, + subject: &str, + object: &str, + subject_var: &'a str, + object_var: &'a str, + both_vars_bound: &Arc>>, + subject_var_bound: &Arc>>, + object_var_bound: &Arc>>, + neither_var_bound: &Arc>, + final_results_arc: &Arc>>, + local_results: &mut Vec>, ) { - if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { - for &idx in result_indices { - // Clone efficiently with pre-allocation - let result = final_results_arc[idx].clone(); - local_results.push(result); - } - return; // Early return after handling the most restrictive case - } - - // Check for subject var bound - second most restrictive - if let Some(result_indices) = subject_var_bound.get(subject) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - // Check for object consistency if it exists - if let Some(existing_object) = base_result.get(object_var) { - if existing_object == object { - local_results.push(base_result.clone()); - } - } else { - let mut extended_result = base_result.clone(); - extended_result.insert(object_var, object.to_string()); - local_results.push(extended_result); - } - } - } - - // Check for object var bound - if let Some(result_indices) = object_var_bound.get(object) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - if let Some(existing_subject) = base_result.get(subject_var) { - if existing_subject == subject { - local_results.push(base_result.clone()); - } - } else { - let mut extended_result = base_result.clone(); - extended_result.insert(subject_var, subject.to_string()); - local_results.push(extended_result); - } - } - } - - // Process least restrictive case - neither var bound - for &idx in neither_var_bound.iter() { - let base_result = &final_results_arc[idx]; - - // Check both consistency 
constraints - let subject_consistent = base_result - .get(subject_var) - .map_or(true, |existing| existing == subject); - let object_consistent = base_result - .get(object_var) - .map_or(true, |existing| existing == object); - - if subject_consistent && object_consistent { - let mut extended_result = base_result.clone(); - - // Only insert if not already present - if !base_result.contains_key(subject_var) { - extended_result.insert(subject_var, subject.to_string()); - } - if !base_result.contains_key(object_var) { - extended_result.insert(object_var, object.to_string()); - } - - local_results.push(extended_result); - } - } + if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { + for &idx in result_indices { + // Clone efficiently with pre-allocation + let result = final_results_arc[idx].clone(); + local_results.push(result); + } + return; // Early return after handling the most restrictive case + } + + // Check for subject var bound - second most restrictive + if let Some(result_indices) = subject_var_bound.get(subject) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + // Check for object consistency if it exists + if let Some(existing_object) = base_result.get(object_var) { + if existing_object == object { + local_results.push(base_result.clone()); + } + } else { + let mut extended_result = base_result.clone(); + extended_result.insert(object_var, object.to_string()); + local_results.push(extended_result); + } + } + } + + // Check for object var bound + if let Some(result_indices) = object_var_bound.get(object) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + if let Some(existing_subject) = base_result.get(subject_var) { + if existing_subject == subject { + local_results.push(base_result.clone()); + } + } else { + let mut extended_result = base_result.clone(); + extended_result.insert(subject_var, subject.to_string()); + local_results.push(extended_result); + } 
+ } + } + + // Process least restrictive case - neither var bound + for &idx in neither_var_bound.iter() { + let base_result = &final_results_arc[idx]; + + // Check both consistency constraints + let subject_consistent = base_result + .get(subject_var) + .map_or(true, |existing| existing == subject); + let object_consistent = base_result + .get(object_var) + .map_or(true, |existing| existing == object); + + if subject_consistent && object_consistent { + let mut extended_result = base_result.clone(); + + // Only insert if not already present + if !base_result.contains_key(subject_var) { + extended_result.insert(subject_var, subject.to_string()); + } + if !base_result.contains_key(object_var) { + extended_result.insert(object_var, object.to_string()); + } + + local_results.push(extended_result); + } + } } diff --git a/kolibrie/src/storage_manager.rs b/kolibrie/src/storage_manager.rs index 23b6fc7..ff5a61f 100644 --- a/kolibrie/src/storage_manager.rs +++ b/kolibrie/src/storage_manager.rs @@ -161,7 +161,7 @@ impl StorageManager { match self.current_backend { StorageBackend::Memory => { // Query the UnifiedIndex in memory database - self.memory_database.index_manager.query(s, p, o) + self.memory_database.index().query(s, p, o) } StorageBackend::Disk => { if let Some(ref lsm) = self.disk_database { @@ -188,7 +188,7 @@ impl StorageManager { // Create temporary database with the disk's index let mut temp_db = SparqlDatabase::new(); - temp_db.index_manager = unified_index; + temp_db.index_manager = Some(unified_index); // Share dictionary and prefixes from memory database temp_db.dictionary = self.memory_database.dictionary.clone(); @@ -222,7 +222,7 @@ impl StorageManager { } // Get all triples from memory's UnifiedIndex - let triples = self.memory_database.index_manager.query(None, None, None); + let triples = self.memory_database.index().query(None, None, None); // Insert into disk if let Some(ref lsm) = self.disk_database { @@ -259,7 +259,7 @@ impl StorageManager { /// 
Get statistics about current storage pub fn get_storage_stats(&self) -> StorageStats { - let memory_triples = self.memory_database.index_manager.query(None, None, None).len(); + let memory_triples = self.memory_database.index().query(None, None, None).len(); let disk_triples = if let Some(ref lsm) = self.disk_database { lsm.get_all_triples().len() } else { @@ -447,4 +447,4 @@ mod tests { // Cleanup std::fs::remove_dir_all("./test_storage_manager_migrate").ok(); } -} \ No newline at end of file +} diff --git a/kolibrie/src/streamertail_optimizer/execution/engine.rs b/kolibrie/src/streamertail_optimizer/execution/engine.rs index bce153c..fbf54c5 100644 --- a/kolibrie/src/streamertail_optimizer/execution/engine.rs +++ b/kolibrie/src/streamertail_optimizer/execution/engine.rs @@ -1006,7 +1006,7 @@ impl ExecutionEngine { match pattern { // FULLY BOUND (3 constants) - just check if triple exists (Term::Constant(s), Term::Constant(p), Term::Constant(o)) => { - if !database.index_manager.query(Some(*s), Some(*p), Some(*o)).is_empty() { + if !database.index().query(Some(*s), Some(*p), Some(*o)).is_empty() { return vec![HashMap::new()]; } else { return Vec::new(); @@ -1053,7 +1053,7 @@ impl ExecutionEngine { let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); // Try efficient two-key scan first - if let Some(objects) = database.index_manager.scan_sp(subject, predicate) { + if let Some(objects) = database.index().scan_sp(subject, predicate) { objects.iter().map(|&object| { let mut result = HashMap::with_capacity(1); result.insert(object_var.clone(), object); @@ -1061,7 +1061,7 @@ impl ExecutionEngine { }).collect() } else { // Fallback: query(Some(s), Some(p), None) - database.index_manager.query(Some(subject), Some(predicate), None) + database.index().query(Some(subject), Some(predicate), None) .into_iter() .map(|triple| { let mut result = HashMap::with_capacity(1); @@ -1081,7 +1081,7 @@ impl ExecutionEngine { ) -> Vec> { let predicate_var = 
predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); - if let Some(predicates) = database.index_manager.scan_so(subject, object) { + if let Some(predicates) = database.index().scan_so(subject, object) { predicates.iter().map(|&predicate| { let mut result = HashMap::with_capacity(1); result.insert(predicate_var.clone(), predicate); @@ -1089,7 +1089,7 @@ impl ExecutionEngine { }).collect() } else { // Fallback: query(Some(s), None, Some(o)) - database.index_manager.query(Some(subject), None, Some(object)) + database.index().query(Some(subject), None, Some(object)) .into_iter() .map(|triple| { let mut result = HashMap::with_capacity(1); @@ -1109,7 +1109,7 @@ impl ExecutionEngine { ) -> Vec> { let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); - if let Some(subjects) = database.index_manager.scan_po(predicate, object) { + if let Some(subjects) = database.index().scan_po(predicate, object) { subjects.iter().map(|&subject| { let mut result = HashMap::with_capacity(1); result.insert(subject_var.clone(), subject); @@ -1117,7 +1117,7 @@ impl ExecutionEngine { }).collect() } else { // Fallback: query(None, Some(p), Some(o)) - database.index_manager.query(None, Some(predicate), Some(object)) + database.index().query(None, Some(predicate), Some(object)) .into_iter() .map(|triple| { let mut result = HashMap::with_capacity(1); @@ -1138,7 +1138,7 @@ impl ExecutionEngine { let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - database.index_manager.query(Some(subject), None, None) + database.index().query(Some(subject), None, None) .into_iter() .map(|triple| { let mut result = HashMap::with_capacity(2); @@ -1159,7 +1159,7 @@ impl ExecutionEngine { let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); 
- database.index_manager.query(None, Some(predicate), None) + database.index().query(None, Some(predicate), None) .into_iter() .map(|triple| { let mut result = HashMap::with_capacity(2); @@ -1180,7 +1180,7 @@ impl ExecutionEngine { let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); - database.index_manager.query(None, None, Some(object)) + database.index().query(None, None, Some(object)) .into_iter() .map(|triple| { let mut result = HashMap::with_capacity(2); @@ -1190,4 +1190,4 @@ impl ExecutionEngine { }) .collect() } -} \ No newline at end of file +} diff --git a/shared/src/index_manager/dynamic_hexastore.rs b/shared/src/index_manager/dynamic_hexastore.rs index 0df27ea..b2bad25 100644 --- a/shared/src/index_manager/dynamic_hexastore.rs +++ b/shared/src/index_manager/dynamic_hexastore.rs @@ -47,3 +47,1040 @@ // after this. If so, and it is not the active index, delete that index from the pool. // By adding to only one index per pool and deleting unilaterally, eventually, unless we keep // switching active indexes, each pool will converge to have one index. +// +// +// +// +// + + + +use std::collections::HashSet; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +// ─── Cost heuristic constants ─────────────────────────────────────────────── +/// Fixed overhead per HashMap maintained in an index (models allocation, cache pressure, etc.) +const MAP_OVERHEAD: f64 = 5.0; +/// Fixed overhead per HashSet maintained in an index +const SET_OVERHEAD: f64 = 2.0; +/// Per-triple space overhead (models memory occupied by each stored value) +const SPACE_OVERHEAD: f64 = 0.01; + +// ─── Index type enum ──────────────────────────────────────────────────────── +/// The six possible triple-index orderings. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum IndexType { + SPO, // subject → predicate → object + SOP, // subject → object → predicate + PSO, // predicate → subject → object + POS, // predicate → object → subject + OSP, // object → subject → predicate + OPS, // object → predicate → subject +} + +impl IndexType { + /// All six permutations. + const ALL: [IndexType; 6] = [ + IndexType::SPO, IndexType::SOP, + IndexType::PSO, IndexType::POS, + IndexType::OSP, IndexType::OPS, + ]; + + /// Create a fresh, empty `Box` for this type. + fn create_empty(&self) -> Box { + match self { + IndexType::SPO => Box::new(SPOSingleIndex::new()), + IndexType::SOP => Box::new(SOPSingleIndex::new()), + IndexType::PSO => Box::new(PSOSingleIndex::new()), + IndexType::POS => Box::new(POSSingleIndex::new()), + IndexType::OSP => Box::new(OSPSingleIndex::new()), + IndexType::OPS => Box::new(OPSSingleIndex::new()), + } + } + + /// Which two-key scan does this index natively support? + fn native_scan(&self) -> ScanKind { + match self { + IndexType::SPO => ScanKind::SP, + IndexType::SOP => ScanKind::SO, + IndexType::PSO => ScanKind::PS, + IndexType::POS => ScanKind::PO, + IndexType::OSP => ScanKind::OS, + IndexType::OPS => ScanKind::OP, + } + } +} + +// ─── Access pattern helpers ───────────────────────────────────────────────── +/// The six possible two-key scans. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum ScanKind { SP, SO, PS, PO, OS, OP } + +/// Compact representation of which components of a triple pattern are bound. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +struct BoundPattern { + s: bool, + p: bool, + o: bool, +} + +impl BoundPattern { + fn from_triple_pattern(pat: &TriplePattern) -> Self { + Self { + s: matches!(pat.0, Term::Constant(_)), + p: matches!(pat.1, Term::Constant(_)), + o: matches!(pat.2, Term::Constant(_)), + } + } + + /// Which scans can serve this pattern efficiently (≤ 2 lookups to reach the leaf set)? 
+ fn efficient_scans(&self) -> Vec { + match (self.s, self.p, self.o) { + // Two bound: exactly one native scan is ideal + (true, true, false) => vec![ScanKind::SP, ScanKind::PS], + (true, false, true) => vec![ScanKind::SO, ScanKind::OS], + (false, true, true) => vec![ScanKind::PO, ScanKind::OP], + // One bound: indexes where the bound component is the first key + (true, false, false) => vec![ScanKind::SP, ScanKind::SO], + (false, true, false) => vec![ScanKind::PS, ScanKind::PO], + (false, false, true) => vec![ScanKind::OS, ScanKind::OP], + // Fully bound or fully unbound: any index works + _ => vec![ScanKind::SP, ScanKind::SO, ScanKind::PS, ScanKind::PO, ScanKind::OS, ScanKind::OP], + } + } + + /// Which IndexTypes can serve this pattern efficiently? + fn efficient_index_types(&self) -> Vec { + self.efficient_scans() + .into_iter() + .map(|sk| match sk { + ScanKind::SP => IndexType::SPO, + ScanKind::SO => IndexType::SOP, + ScanKind::PS => IndexType::PSO, + ScanKind::PO => IndexType::POS, + ScanKind::OS => IndexType::OSP, + ScanKind::OP => IndexType::OPS, + }) + .collect() + } +} + +// ─── Cardinality snapshot (from DatabaseStats or estimated) ───────────────── +/// Lightweight snapshot of the unique-count statistics needed by the heuristic. +#[derive(Debug, Clone)] +pub struct CardinalitySnapshot { + pub num_subjects: f64, + pub num_predicates: f64, + pub num_objects: f64, +} + +impl CardinalitySnapshot { + /// Build from raw counts. + pub fn from_stats( + _total_triples: u64, + unique_subjects: usize, + unique_predicates: usize, + unique_objects: usize, + ) -> Self { + // Use at least 1.0 to avoid division-by-zero in cost formulas + Self { + num_subjects: (unique_subjects as f64).max(1.0), + num_predicates: (unique_predicates as f64).max(1.0), + num_objects: (unique_objects as f64).max(1.0), + } + } + + /// A default snapshot when we have no data yet. 
+ fn unknown() -> Self { + Self { num_subjects: 1.0, num_predicates: 1.0, num_objects: 1.0 } + } +} + +// ─── Cost heuristic functions ─────────────────────────────────────────────── + +/// Estimate the query cost (number of hash lookups) for scanning `idx` to answer `pat`. +/// Lower is better. +fn query_cost(idx: IndexType, pat: BoundPattern, card: &CardinalitySnapshot) -> f64 { + let s = card.num_subjects; + let p = card.num_predicates; + let o = card.num_objects; + + // The three levels of the nested HashMap for this index type: + let (l1, l2, _l3) = index_level_sizes(idx, s, p, o); + + match (pat.s, pat.p, pat.o) { + // Fully bound – always 3 lookups regardless of index + (true, true, true) => 3.0, + + // Two bound + (true, true, false) => two_bound_cost(idx, 's', 'p', l1, l2), + (true, false, true) => two_bound_cost(idx, 's', 'o', l1, l2), + (false, true, true) => two_bound_cost(idx, 'p', 'o', l1, l2), + + // One bound + (true, false, false) => one_bound_cost(idx, 's', l1, l2), + (false, true, false) => one_bound_cost(idx, 'p', l1, l2), + (false, false, true) => one_bound_cost(idx, 'o', l1, l2), + + // No bound – full scan: iterate all level-1, all level-2 + (false, false, false) => l1 * l2, + } +} + +/// Return (#level1, #level2-per-l1, #level3-per-l2) for an index type given cardinalities. +fn index_level_sizes(idx: IndexType, s: f64, p: f64, o: f64) -> (f64, f64, f64) { + match idx { + IndexType::SPO => (s, p, o), + IndexType::SOP => (s, o, p), + IndexType::PSO => (p, s, o), + IndexType::POS => (p, o, s), + IndexType::OSP => (o, s, p), + IndexType::OPS => (o, p, s), + } +} + +/// Map a component char ('s','p','o') to the position (0,1,2) in the given index type. 
+fn component_position(idx: IndexType, comp: char) -> usize { + let order = match idx { + IndexType::SPO => ['s', 'p', 'o'], + IndexType::SOP => ['s', 'o', 'p'], + IndexType::PSO => ['p', 's', 'o'], + IndexType::POS => ['p', 'o', 's'], + IndexType::OSP => ['o', 's', 'p'], + IndexType::OPS => ['o', 'p', 's'], + }; + order.iter().position(|&c| c == comp).unwrap_or(2) +} + +/// Cost when exactly two components are bound. +fn two_bound_cost( + idx: IndexType, a: char, b: char, + l1: f64, l2: f64, +) -> f64 { + let pos_a = component_position(idx, a); + let pos_b = component_position(idx, b); + + match (pos_a, pos_b) { + // Both bound components match first two keys → 2 lookups (ideal) + (0, 1) | (1, 0) => 2.0, + // First key bound, third key bound → 1 lookup + iterate level-2 + (0, 2) | (2, 0) => 1.0 + l2, + // Second key bound, third key bound → iterate level-1, then 1 lookup each + (1, 2) | (2, 1) => l1 + l1, + _ => l1 * l2, // fallback + } +} + +/// Cost when exactly one component is bound. +fn one_bound_cost( + idx: IndexType, comp: char, + l1: f64, l2: f64, +) -> f64 { + let pos = component_position(idx, comp); + match pos { + 0 => 1.0 + l2, // first key bound → 1 lookup, iterate second level + 1 => l1, // second key bound → iterate first level, 1 lookup each + 2 => l1 * l2, // third key bound → full scan of first two levels + _ => l1 * l2, + } +} + +/// Estimate the maintenance cost of keeping an index alive. +fn maintenance_cost(idx: IndexType, card: &CardinalitySnapshot) -> f64 { + let (l1, l2, l3) = index_level_sizes(idx, card.num_subjects, card.num_predicates, card.num_objects); + MAP_OVERHEAD * (1.0 + l1) + SET_OVERHEAD * (l1 * l2) + SPACE_OVERHEAD * (l1 * l2 * l3) +} + +// ─── Index Pool ───────────────────────────────────────────────────────────── + +/// A pool of indexes that collectively cover all triples in the store. +/// Exactly one index is *active* (receives inserts); the rest are legacy +/// indexes being drained through deletions. 
+#[derive(Debug, Clone)] +struct IndexPool { + /// The index type that this pool ideally maintains. + desired_type: IndexType, + /// Index of the active (insert-target) index in `indexes`. + active_idx: usize, + /// The types corresponding to each index in `indexes`. + types: Vec, + /// The actual index implementations. + indexes: Vec>, +} + +impl IndexPool { + fn new(desired: IndexType) -> Self { + let idx = desired.create_empty(); + Self { + desired_type: desired, + active_idx: 0, + types: vec![desired], + indexes: vec![idx], + } + } + + /// Insert a triple into the active index only. + fn insert(&mut self, triple: &Triple) -> bool { + self.indexes[self.active_idx].insert(triple) + } + + /// Delete a triple from *every* index in the pool. + /// Also garbage-collect empty non-active indexes. + fn delete(&mut self, triple: &Triple) -> bool { + let mut any_deleted = false; + for idx in &mut self.indexes { + if idx.delete(triple) { + any_deleted = true; + } + } + // Garbage-collect empty non-active indexes (walk backwards for safe removal) + let mut i = self.indexes.len(); + while i > 0 { + i -= 1; + if i != self.active_idx && self.indexes[i].triple_count() == 0 { + self.indexes.remove(i); + self.types.remove(i); + // Adjust active_idx if it was shifted + if self.active_idx > i { + self.active_idx -= 1; + } + } + } + any_deleted + } + + /// Does this pool contain an index of the given type? + fn contains_type(&self, t: IndexType) -> bool { + self.types.contains(&t) + } + + /// Switch the active index to the given type. + /// If the type already exists in the pool, just change the active pointer. + /// Otherwise, create a new empty index of that type and make it active. 
+ fn switch_active(&mut self, new_type: IndexType) { + if let Some(pos) = self.types.iter().position(|&t| t == new_type) { + self.active_idx = pos; + } else { + let new_idx = new_type.create_empty(); + self.indexes.push(new_idx); + self.types.push(new_type); + self.active_idx = self.indexes.len() - 1; + } + self.desired_type = new_type; + } + + /// Query: for correctness we must query ALL indexes in the pool and merge + /// results (new inserts go to the active index but older data may still + /// reside in legacy indexes). + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + if self.indexes.len() == 1 { + return self.indexes[0].query(s, p, o); + } + // Merge results from all indexes, deduplicate + let mut seen = HashSet::new(); + let mut results = Vec::new(); + for idx in &self.indexes { + for triple in idx.query(s, p, o) { + if seen.insert((triple.subject, triple.predicate, triple.object)) { + results.push(triple); + } + } + } + results + } + + fn clear(&mut self) { + for idx in &mut self.indexes { + idx.clear(); + } + } + + fn triple_count(&self) -> usize { + // The union of all indexes; must deduplicate. + self.query(None, None, None).len() + } + + /// Helper: merge scan results from all indexes that support the scan. + fn merge_scan(&self, scan_fn: F) -> Option> + where + F: Fn(&Box) -> Option<&HashSet>, + { + let mut merged: Option> = None; + for idx in &self.indexes { + if let Some(set) = scan_fn(idx) { + match &mut merged { + Some(m) => m.extend(set.iter().copied()), + None => merged = Some(set.clone()), + } + } + } + merged.map(|s| s.into_iter().collect()) + } +} + +// ─── DynamicHexastoreIndex ────────────────────────────────────────────────── + +/// A dynamic hexastore that only builds the index permutations that are +/// actually needed by the registered access patterns, and can switch +/// the active index within each pool based on data-cardinality heuristics. 
+#[derive(Debug, Clone)] +pub struct DynamicHexastoreIndex { + /// The access patterns this index was initialised with. + access_patterns: Vec, + /// One pool per selected index permutation. + pools: Vec, + /// For each access pattern (by position in `access_patterns`), which pool index serves it. + pattern_to_pool: Vec, + /// How many triples have been inserted since last re-evaluation. + inserts_since_eval: usize, + /// How often (in inserts) to re-evaluate the heuristic. + eval_interval: usize, + /// Latest cardinality snapshot (updated during re-evaluation). + latest_card: CardinalitySnapshot, +} + +impl DynamicHexastoreIndex { + // ── Construction ──────────────────────────────────────────────────── + + /// Create a new `DynamicHexastoreIndex` tailored to the given access patterns. + /// + /// `eval_interval` controls how often (in number of inserts) the heuristic + /// is re-evaluated. A typical value is 10 000. + pub fn new(access_patterns: Vec, eval_interval: usize) -> Self { + let eval_interval = eval_interval.max(1); + let card = CardinalitySnapshot::unknown(); + + // Step 2: choose initial indexes purely from access patterns + let needed = Self::initial_index_set(&access_patterns); + + // Step 3: create pools & assign patterns + let pools: Vec = needed.iter().map(|&t| IndexPool::new(t)).collect(); + let pattern_to_pool = Self::assign_patterns_to_pools(&access_patterns, &pools, &card); + + Self { + access_patterns, + pools, + pattern_to_pool, + inserts_since_eval: 0, + eval_interval, + latest_card: card, + } + } + + /// Convenience: create with default eval interval. 
+ pub fn with_patterns(access_patterns: Vec) -> Self { + Self::new(access_patterns, 10_000) + } + + // ── Step 2: choose initial indexes ────────────────────────────────── + + fn initial_index_set(patterns: &[TriplePattern]) -> Vec { + if patterns.is_empty() { + // Fallback: just use SPO + return vec![IndexType::SPO]; + } + + let mut needed: HashSet = HashSet::new(); + + for pat in patterns { + let bp = BoundPattern::from_triple_pattern(pat); + let candidates = bp.efficient_index_types(); + if candidates.len() == 1 { + // Only one good index for this pattern → must include it + needed.insert(candidates[0]); + } + } + + // For patterns that have multiple candidates, try to reuse already-selected indexes + for pat in patterns { + let bp = BoundPattern::from_triple_pattern(pat); + let candidates = bp.efficient_index_types(); + if candidates.len() > 1 { + // Prefer an already-needed index + let reuse = candidates.iter().find(|c| needed.contains(c)); + if reuse.is_none() { + // Pick the first candidate + needed.insert(candidates[0]); + } + } + } + + if needed.is_empty() { + needed.insert(IndexType::SPO); + } + + needed.into_iter().collect() + } + + // ── Step 3: assign patterns to pools ──────────────────────────────── + + fn assign_patterns_to_pools( + patterns: &[TriplePattern], + pools: &[IndexPool], + card: &CardinalitySnapshot, + ) -> Vec { + patterns + .iter() + .map(|pat| { + let bp = BoundPattern::from_triple_pattern(pat); + // Find the pool whose desired_type gives the lowest query cost + pools + .iter() + .enumerate() + .min_by(|(_, a), (_, b)| { + let ca = query_cost(a.desired_type, bp, card); + let cb = query_cost(b.desired_type, bp, card); + ca.partial_cmp(&cb).unwrap_or(std::cmp::Ordering::Equal) + }) + .map(|(i, _)| i) + .unwrap_or(0) + }) + .collect() + } + + // ── Step 5: re-evaluate with heuristic ────────────────────────────── + + /// Update the cardinality snapshot and optionally re-evaluate which + /// indexes to maintain. 
+ pub fn update_cardinalities(&mut self, card: CardinalitySnapshot) { + self.latest_card = card; + } + + /// Force a re-evaluation of the index set based on the current cardinalities. + pub fn reevaluate(&mut self) { + // Clone the snapshot so we don't hold an immutable borrow on `self` + // while calling `transition_pools(&mut self, …)`. + let card = self.latest_card.clone(); + let patterns = &self.access_patterns; + + // Find the optimal set of index types + let best_set = Self::find_best_index_set(patterns, &card); + + // Transition pools + self.transition_pools(best_set, &card); + } + + /// Find the set of IndexTypes that minimises total cost. + fn find_best_index_set(patterns: &[TriplePattern], card: &CardinalitySnapshot) -> Vec { + // We enumerate subsets of IndexType::ALL that can cover every pattern. + // Since there are only 6 types, 2^6 = 64 subsets — perfectly tractable. + + let all = IndexType::ALL; + let n = all.len(); + let mut best_cost = f64::MAX; + let mut best_set: Vec = vec![IndexType::SPO]; // fallback + + let bound_patterns: Vec = patterns + .iter() + .map(|p| BoundPattern::from_triple_pattern(p)) + .collect(); + + for mask in 1u32..(1 << n) { + let set: Vec = (0..n) + .filter(|&i| mask & (1 << i) != 0) + .map(|i| all[i]) + .collect(); + + // Check that every pattern can be served by at least one index in the set + let covers_all = bound_patterns.iter().all(|bp| { + let efficient = bp.efficient_index_types(); + // Either a native efficient index is in the set, + // or at least some index exists (fallback to full scan) + efficient.iter().any(|e| set.contains(e)) || !set.is_empty() + }); + + if !covers_all { + continue; + } + + // Compute total cost = query costs + maintenance costs + let query_total: f64 = bound_patterns + .iter() + .map(|bp| { + // Pick the best index from the set for this pattern + set.iter() + .map(|&idx| query_cost(idx, *bp, card)) + .fold(f64::MAX, f64::min) + }) + .sum(); + + let maint_total: f64 = set.iter().map(|&idx| 
maintenance_cost(idx, card)).sum(); + + let total = query_total + maint_total; + if total < best_cost { + best_cost = total; + best_set = set; + } + } + + best_set + } + + /// Transition from old pools to new pools following the transition steps. + fn transition_pools(&mut self, new_types: Vec, card: &CardinalitySnapshot) { + // Step 1: create new pool descriptors + let mut new_pools: Vec = new_types.iter().map(|&t| IndexPool::new(t)).collect(); + + // Step 2: assign every access pattern to the best new pool + let new_assignment = Self::assign_patterns_to_pools( + &self.access_patterns, + &new_pools, + card, + ); + + // Step 3: for every new pool, find the best matching old pool + for (new_pool_idx, new_pool) in new_pools.iter_mut().enumerate() { + // Collect access patterns assigned to this new pool + let assigned_pats: Vec = new_assignment + .iter() + .enumerate() + .filter(|(_, &pool_idx)| pool_idx == new_pool_idx) + .map(|(pat_idx, _)| pat_idx) + .collect(); + + if assigned_pats.is_empty() { + continue; + } + + // Find the old pool that minimizes cost for these patterns + let best_old = self.pools.iter().enumerate().min_by(|(_, a), (_, b)| { + let cost_a: f64 = assigned_pats.iter().map(|&pi| { + let bp = BoundPattern::from_triple_pattern(&self.access_patterns[pi]); + query_cost(a.desired_type, bp, card) + }).sum(); + let cost_b: f64 = assigned_pats.iter().map(|&pi| { + let bp = BoundPattern::from_triple_pattern(&self.access_patterns[pi]); + query_cost(b.desired_type, bp, card) + }).sum(); + cost_a.partial_cmp(&cost_b).unwrap_or(std::cmp::Ordering::Equal) + }); + + if let Some((old_idx, _)) = best_old { + // Step 5: check if the desired index type already exists in the old pool + let old_pool = &self.pools[old_idx]; + if old_pool.contains_type(new_pool.desired_type) { + // Reuse the old pool, just switch active + let mut reused = old_pool.clone(); + reused.switch_active(new_pool.desired_type); + *new_pool = reused; + } else { + // Create a new index of the 
desired type, keep old indexes for data + let mut merged = old_pool.clone(); + merged.switch_active(new_pool.desired_type); + *new_pool = merged; + } + } + } + + // Step 4: old pools not assigned to any new pool are simply dropped + self.pools = new_pools; + self.pattern_to_pool = new_assignment; + } + + /// Called after each insert to possibly trigger re-evaluation. + fn maybe_reevaluate(&mut self) { + self.inserts_since_eval += 1; + if self.inserts_since_eval >= self.eval_interval { + self.inserts_since_eval = 0; + self.reevaluate(); + } + } +} + +// ─── TripleIndex implementation ───────────────────────────────────────────── + +impl TripleIndex for DynamicHexastoreIndex { + fn clone_empty(&self) -> Box { + Box::new(DynamicHexastoreIndex::new( + self.access_patterns.clone(), + self.eval_interval, + )) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + // Any single pool contains all triples (by the pool invariant). + if self.pools.is_empty() { + return 0; + } + self.pools[0].triple_count() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + // Report the union of all pools' native scans + let mut support = AccessPatternSupport { + sp: false, so: false, po: false, + ps: false, os: false, op: false, + }; + for pool in &self.pools { + match pool.desired_type.native_scan() { + ScanKind::SP => support.sp = true, + ScanKind::SO => support.so = true, + ScanKind::PO => support.po = true, + ScanKind::PS => support.ps = true, + ScanKind::OS => support.os = true, + ScanKind::OP => support.op = true, + } + } + support + } + + // ── Mutation ──────────────────────────────────────────────────────── + + fn insert(&mut self, triple: &Triple) -> bool { + if self.pools.is_empty() { + return false; + } + + // Step 4: add to every pool exactly once (via each pool's active index) + let mut any_new = false; + for pool in &mut self.pools { + if pool.insert(triple) { + any_new = true; + } + } + + if any_new { + 
self.maybe_reevaluate(); + } + + any_new + } + + fn delete(&mut self, triple: &Triple) -> bool { + // Step 6: delete from every index in every pool + let mut any_deleted = false; + for pool in &mut self.pools { + if pool.delete(triple) { + any_deleted = true; + } + } + any_deleted + } + + fn clear(&mut self) { + for pool in &mut self.pools { + pool.clear(); + } + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + self.clear(); + // Simple insert loop (pools handle the routing) + for triple in triples { + for pool in &mut self.pools { + pool.insert(triple); + } + } + // After bulk load, gather rough cardinalities and reevaluate + let mut subjects = HashSet::new(); + let mut predicates = HashSet::new(); + let mut objects = HashSet::new(); + for t in triples { + subjects.insert(t.subject); + predicates.insert(t.predicate); + objects.insert(t.object); + } + self.latest_card = CardinalitySnapshot::from_stats( + triples.len() as u64, + subjects.len(), + predicates.len(), + objects.len(), + ); + self.reevaluate(); + } + + // ── Query ─────────────────────────────────────────────────────────── + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + // Any pool contains all the data, so we pick the best pool for this query shape. 
+ if self.pools.is_empty() { + return Vec::new(); + } + + let bp = BoundPattern { + s: s.is_some(), + p: p.is_some(), + o: o.is_some(), + }; + + // Find the pool with the lowest query cost for this pattern + let best_pool = self.pools + .iter() + .min_by(|a, b| { + let ca = query_cost(a.desired_type, bp, &self.latest_card); + let cb = query_cost(b.desired_type, bp, &self.latest_card); + ca.partial_cmp(&cb).unwrap_or(std::cmp::Ordering::Equal) + }) + .unwrap(); + + best_pool.query(s, p, o) + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Term::Constant(x) => Some(*x), Term::Variable(_) => None }; + let pre = match p { Term::Constant(x) => Some(*x), Term::Variable(_) => None }; + let obj = match o { Term::Constant(x) => Some(*x), Term::Variable(_) => None }; + self.query(sub, pre, obj) + } + + // ── Two-key scans ─────────────────────────────────────────────────── + // These delegate to the pool whose desired_type natively supports the scan. + // Because pools may have multiple internal indexes, the pool merges results. + // We return None if no pool supports it natively with a single index + // (the engine will fall back to query() + filter). 
+ + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + for pool in &self.pools { + if pool.indexes.len() == 1 && pool.desired_type == IndexType::SPO { + return pool.indexes[0].scan_sp(s, p); + } + } + None + } + + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + for pool in &self.pools { + if pool.indexes.len() == 1 && pool.desired_type == IndexType::SOP { + return pool.indexes[0].scan_so(s, o); + } + } + None + } + + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + for pool in &self.pools { + if pool.indexes.len() == 1 && pool.desired_type == IndexType::POS { + return pool.indexes[0].scan_po(p, o); + } + } + None + } + + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + for pool in &self.pools { + if pool.indexes.len() == 1 && pool.desired_type == IndexType::PSO { + return pool.indexes[0].scan_ps(p, s); + } + } + None + } + + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + for pool in &self.pools { + if pool.indexes.len() == 1 && pool.desired_type == IndexType::OSP { + return pool.indexes[0].scan_os(o, s); + } + } + None + } + + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { + for pool in &self.pools { + if pool.indexes.len() == 1 && pool.desired_type == IndexType::OPS { + return pool.indexes[0].scan_op(o, p); + } + } + None + } + + // ── Bulk / optimisation ───────────────────────────────────────────── + + fn optimize(&mut self) { + for pool in &mut self.pools { + for idx in &mut pool.indexes { + idx.optimize(); + } + } + } +} + +// ─── Tests ────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + use crate::triple::Triple; + + fn make_triple(s: u32, p: u32, o: u32) -> Triple { + Triple { subject: s, predicate: p, object: o } + } + + #[test] + fn test_insert_and_query_basic() { + // Access pattern: (?s, p=1, ?o) — only predicate bound + let patterns = vec![ + (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), + ]; + let mut idx = 
DynamicHexastoreIndex::with_patterns(patterns); + + assert!(idx.insert(&make_triple(10, 1, 100))); + assert!(idx.insert(&make_triple(20, 1, 200))); + assert!(idx.insert(&make_triple(30, 2, 300))); + + let result = idx.query(None, Some(1), None); + assert_eq!(result.len(), 2); + + let result_all = idx.query(None, None, None); + assert_eq!(result_all.len(), 3); + } + + #[test] + fn test_delete_removes_from_all_pools() { + let patterns = vec![ + (Term::Constant(1), Term::Variable("p".into()), Term::Variable("o".into())), + (Term::Variable("s".into()), Term::Constant(2), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::with_patterns(patterns); + + idx.insert(&make_triple(1, 2, 3)); + idx.insert(&make_triple(1, 2, 4)); + assert_eq!(idx.triple_count(), 2); + + assert!(idx.delete(&make_triple(1, 2, 3))); + assert_eq!(idx.triple_count(), 1); + + // Deleting non-existent triple returns false + assert!(!idx.delete(&make_triple(99, 99, 99))); + } + + #[test] + fn test_duplicate_insert_returns_false() { + let patterns = vec![ + (Term::Variable("s".into()), Term::Variable("p".into()), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::with_patterns(patterns); + + assert!(idx.insert(&make_triple(1, 2, 3))); + assert!(!idx.insert(&make_triple(1, 2, 3))); + assert_eq!(idx.triple_count(), 1); + } + + #[test] + fn test_build_from_triples() { + let patterns = vec![ + (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), + (Term::Constant(10), Term::Variable("p".into()), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::with_patterns(patterns); + + let triples: Vec = (0..100) + .map(|i| make_triple(i % 10, i % 5, i)) + .collect(); + idx.build_from_triples(&triples); + + assert_eq!(idx.triple_count(), 100); + + // Query specific predicate + let p1 = idx.query(None, Some(1), None); + assert_eq!(p1.len(), 20); // i % 5 == 1 for i=1,6,11,...,96 → 20 triples + } + + #[test] + fn test_clear() { + let 
patterns = vec![ + (Term::Variable("s".into()), Term::Variable("p".into()), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::with_patterns(patterns); + idx.insert(&make_triple(1, 2, 3)); + idx.insert(&make_triple(4, 5, 6)); + assert_eq!(idx.triple_count(), 2); + + idx.clear(); + assert_eq!(idx.triple_count(), 0); + } + + #[test] + fn test_get_matching_triples() { + let patterns = vec![ + (Term::Constant(1), Term::Constant(2), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::with_patterns(patterns); + idx.insert(&make_triple(1, 2, 10)); + idx.insert(&make_triple(1, 2, 20)); + idx.insert(&make_triple(1, 3, 30)); + + let pat = (Term::Constant(1), Term::Constant(2), Term::Variable("o".into())); + let result = idx.get_matching_triples(&pat); + assert_eq!(result.len(), 2); + } + + #[test] + fn test_reevaluate_does_not_lose_data() { + let patterns = vec![ + (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::new(patterns, 5); + + // Insert enough to trigger re-evaluation + for i in 0..20 { + idx.insert(&make_triple(i, 1, i * 10)); + } + // Update cardinalities and force reevaluate + idx.update_cardinalities(CardinalitySnapshot { + num_subjects: 20.0, + num_predicates: 1.0, + num_objects: 20.0, + }); + idx.reevaluate(); + + // Data should still be there + assert_eq!(idx.triple_count(), 20); + let result = idx.query(None, Some(1), None); + assert_eq!(result.len(), 20); + } + + #[test] + fn test_supported_access_patterns() { + // Pattern needs SP scan → should report sp=true + let patterns = vec![ + (Term::Constant(1), Term::Constant(2), Term::Variable("o".into())), + ]; + let idx = DynamicHexastoreIndex::with_patterns(patterns); + let support = idx.supported_access_patterns(); + // At least one of SP or PS should be supported + assert!(support.sp || support.ps); + } + + #[test] + fn test_cost_functions() { + let card = CardinalitySnapshot { + num_subjects:
100.0, + num_predicates: 10.0, + num_objects: 50.0, + }; + + // Querying SPO for bound s,p should be cheap (2 lookups) + let cost_sp = query_cost(IndexType::SPO, BoundPattern { s: true, p: true, o: false }, &card); + assert_eq!(cost_sp, 2.0); + + // Querying SPO for bound o only should be expensive (full scan) + let cost_o = query_cost(IndexType::SPO, BoundPattern { s: false, p: false, o: true }, &card); + assert!(cost_o > 10.0); + + // OPS should be cheap for bound o + let cost_o_ops = query_cost(IndexType::OPS, BoundPattern { s: false, p: false, o: true }, &card); + assert!(cost_o_ops < cost_o); + } + + #[test] + fn test_clone_empty_and_clone_box() { + let patterns = vec![ + (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), + ]; + let mut idx = DynamicHexastoreIndex::with_patterns(patterns); + idx.insert(&make_triple(1, 1, 10)); + + let empty = idx.clone_empty(); + assert_eq!(empty.triple_count(), 0); + + let cloned = idx.clone_box(); + assert_eq!(cloned.triple_count(), 1); + } +} diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs index f68c683..2d37240 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -21,6 +21,7 @@ pub use pso_single::PSOSingleIndex; pub use sop_single::SOPSingleIndex; pub use spo_single::SPOSingleIndex; pub use single_table::SingleTableIndex; +pub use dynamic_hexastore::DynamicHexastoreIndex; pub mod hexastore; pub mod ops_single; pub mod osp_single; @@ -29,7 +30,49 @@ pub mod pso_single; pub mod sop_single; pub mod spo_single; pub mod single_table; +pub mod dynamic_hexastore; +#[derive(Debug, Clone)] +pub enum IndexConfig { + /// Full hexastore — all 6 permutations, no questions asked. + Hexastore, + + /// A single permutation index. + SPO, + POS, + OSP, + PSO, + OPS, + SOP, + + /// Flat single-table index. + SingleTable, + + /// Dynamic hexastore: builds only the permutations the workload + /// actually needs and re-evaluates periodically. 
+ /// + /// `eval_interval` — number of inserts between re-evaluation. + /// `queries` — raw SPARQL strings; SparqlDatabase will parse + /// them through the real parser + dictionary and + /// pass the resolved `Vec` to the + /// index constructor. + DynamicHexastore { + eval_interval: u64, + queries: Vec, + }, + + // ── Future index types go here ── + // YourNewIndex { + // some_setting: usize, + // queries: Vec, // if it needs resolved patterns + // }, +} + +impl Default for IndexConfig { + fn default() -> Self { + IndexConfig::Hexastore + } +} /// Describes which access patterns an index can serve efficiently. #[derive(Debug, Clone)] From 22165924e974d74b47df9b99c38cfd3314274e73 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Fri, 6 Mar 2026 10:20:25 +0100 Subject: [PATCH 10/23] Fix all indexes shell file --- .../n_triples_data/n_triple_10M_all_indexes.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index 93e5087..c8ce8dc 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -16,8 +16,15 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" INDEX_TYPES=( - "dynamic_hexastore", + "dynamic_hexastore" "hexastore" + "ops" + "osp" + "pos" + "pso" + "sop" + "spo" + "table" ) echo "==============================================" From 35cea67bb134f841ec7598ae0e097c92da291dc2 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Mon, 23 Mar 2026 01:20:09 +0100 Subject: [PATCH 11/23] Implement buckets --- .../n_triples_data/n_triple_10M.rs | 277 +++++++++--------- .../n_triple_10M_all_indexes.sh | 1 + 2 files changed, 140 insertions(+), 138 deletions(-) diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs 
b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index d1e2f3d..b6ae8a6 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -30,115 +30,11 @@ use std::io::{BufRead, BufReader}; use std::time::Instant; use shared::index_manager::*; -fn make_config_from_env() -> (String, IndexConfig) { - let index_type = std::env::var("INDEX_TYPE") - .unwrap_or_else(|_| "hexastore".to_string()) - .to_lowercase(); - - let config = match index_type.as_str() { - "hexastore" | "" => IndexConfig::Hexastore, - "spo" => IndexConfig::SPO, - "pos" => IndexConfig::POS, - "osp" => IndexConfig::OSP, - "pso" => IndexConfig::PSO, - "ops" => IndexConfig::OPS, - "sop" => IndexConfig::SOP, - "table" => IndexConfig::SingleTable, - "dynamic" => IndexConfig::DynamicHexastore { - eval_interval: 1000, - queries: vec![], // or read from another env var - }, - other => { - eprintln!( - "WARNING: Unknown INDEX_TYPE '{}', falling back to hexastore.", - other - ); - IndexConfig::Hexastore - } - }; - - (index_type, config) -} - -fn parse_large_ntriples_file( - file_path: &str, -) -> Result> { - let (index_name, config) = make_config_from_env(); - println!("INDEX_TYPE = {}", index_name); - println!("Starting to parse N-Triples file: {}", file_path); - let start_time = Instant::now(); - - //let mut db = SparqlDatabase::new(); - //let mut db = SparqlDatabase::with_index(Box::new(SPOSingleIndex::new())); - let mut db = SparqlDatabase::with_config(config); - - // Much smaller buffer and more aggressive memory management - let file = File::open(file_path)?; - let reader = BufReader::with_capacity(64 * 1024, file); // Reduced buffer size +type QuerySpec = (&'static str, &'static str); - let mut line_count = 0; - let mut batch_lines = Vec::new(); - const BATCH_SIZE: usize = 10_000; // Much smaller batch size - - for line_result in reader.lines() { - let line = line_result?; - - if line.trim().is_empty() || 
line.starts_with('#') { - continue; - } - - batch_lines.push(line); - line_count += 1; - - if batch_lines.len() >= BATCH_SIZE { - // Process batch immediately - let batch_data = batch_lines.join("\n"); - db.parse_ntriples_and_add(&batch_data); - - // Aggressive cleanup - batch_lines.clear(); - batch_lines.shrink_to_fit(); - - // Progress info every 100k triples - if line_count % 100_000 == 0 { - println!("Processed {} triples", line_count); - std::hint::black_box(()); - - // Optional: small delay to let the system breathe - std::thread::sleep(std::time::Duration::from_millis(10)); - } - } - } - - // Process remaining batch - if !batch_lines.is_empty() { - let batch_data = batch_lines.join("\n"); - db.parse_ntriples_and_add(&batch_data); - } - db.get_or_build_stats(); - - println!( - "Finished parsing {} triples in {:.2} seconds", - line_count, - start_time.elapsed().as_secs_f64() - ); - - // Build indexes after parsing - this is where the magic happens - println!("Building indexes..."); - let index_start = Instant::now(); - db.build_all_indexes(); - println!("Indexes built in {:.2} seconds", index_start.elapsed().as_secs_f64()); - - Ok(db) -} - -fn run_all_queries(db: &mut SparqlDatabase) { - const ITERATIONS: usize = 3; - - // (name, query) - let queries: &[(&str, &str)] = &[ - // C1 - ( +fn workload_queries() -> Vec { + vec![ + ( "C1", r#"PREFIX wsdbm: PREFIX sorg: @@ -602,45 +498,150 @@ fn run_all_queries(db: &mut SparqlDatabase) { } "#, ), - ]; + ] +} - for (name, query) in queries.iter() { - println!("=============================================="); - println!("Running query {} ({} iterations)...", name, ITERATIONS); +fn queries_for_index_manager(workload: &[QuerySpec]) -> Vec { + workload + .iter() + .map(|(_, q)| q.trim().to_string()) + .collect() +} - let mut total_time = 0.0; - // let mut last_result:Vec> = Vec::new(); +fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { + let index_type = std::env::var("INDEX_TYPE") + .unwrap_or_else(|_| 
"hexastore".to_string()) + .to_lowercase(); - for _ in 0..ITERATIONS { - let start = Instant::now(); - let _ = execute_query_rayon_parallel2_volcano(query, db); - let elapsed = start.elapsed().as_secs_f64(); - total_time += elapsed; - } + let config = match index_type.as_str() { + "hexastore" | "" => IndexConfig::Hexastore, + "spo" => IndexConfig::SPO, + "pos" => IndexConfig::POS, + "osp" => IndexConfig::OSP, + "pso" => IndexConfig::PSO, + "ops" => IndexConfig::OPS, + "sop" => IndexConfig::SOP, + "table" => IndexConfig::SingleTable, + "dynamic" => IndexConfig::DynamicHexastore { + eval_interval: 1000, + queries, + }, + "buckets" => IndexConfig::Buckets { queries }, + other => { + eprintln!( + "WARNING: Unknown INDEX_TYPE '{}', falling back to hexastore.", + other + ); + IndexConfig::Hexastore + } + }; - let avg = total_time / (ITERATIONS as f64); - println!("Average time for {}: {:.6} seconds", name, avg); - } + (index_type, config) } -fn main() { - // Set current directory to the root of the project - std::env::set_current_dir(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) - .expect("Failed to set project root as current directory"); +fn parse_large_ntriples_file( + file_path: &str, + workload: &[QuerySpec], +) -> Result> { + let (index_name, config) = make_config_from_env(queries_for_index_manager(workload)); + println!("INDEX_TYPE = {}", index_name); + println!("Starting to parse N-Triples file: {}", file_path); + + let start_time = Instant::now(); + let mut db = SparqlDatabase::with_config(config); + + let file = File::open(file_path)?; + let reader = BufReader::with_capacity(64 * 1024, file); + + let mut line_count = 0; + let mut batch_lines = Vec::new(); + const BATCH_SIZE: usize = 10_000; + + for line_result in reader.lines() { + let line = line_result?; + + if line.trim().is_empty() || line.starts_with('#') { + continue; + } + + batch_lines.push(line); + line_count += 1; + + if batch_lines.len() >= BATCH_SIZE { + let batch_data = batch_lines.join("\n"); 
+ db.parse_ntriples_and_add(&batch_data); - let file_path = "../benchmark_dataset/watdiv.10M.nt"; + batch_lines.clear(); + batch_lines.shrink_to_fit(); - match parse_large_ntriples_file(file_path) { - Ok(mut db) => { - println!("Successfully processed N-Triples file"); - run_all_queries(&mut db); + if line_count % 100_000 == 0 { + println!("Processed {} triples", line_count); + std::thread::sleep(std::time::Duration::from_millis(10)); + } + } + } + + if !batch_lines.is_empty() { + let batch_data = batch_lines.join("\n"); + db.parse_ntriples_and_add(&batch_data); } - Err(e) => { - eprintln!("Error processing file '{}': {}", file_path, e); - println!( - "File not found or error occurred. \ - Make sure ../benchmark_dataset/watdiv.10M.nt exists." + + db.get_or_build_stats(); + + println!( + "Finished parsing {} triples in {:.2} seconds", + line_count, + start_time.elapsed().as_secs_f64() + ); + + println!("Building indexes..."); + let index_start = Instant::now(); + db.build_all_indexes(); + println!( + "Indexes built in {:.2} seconds", + index_start.elapsed().as_secs_f64() ); + + Ok(db) +} + +fn run_all_queries(db: &mut SparqlDatabase, workload: &[QuerySpec]) { + const ITERATIONS: usize = 3; + + for (name, query) in workload.iter() { + println!("=============================================="); + println!("Running query {} ({} iterations)...", name, ITERATIONS); + + let mut total_time = 0.0; + + for _ in 0..ITERATIONS { + let start = Instant::now(); + let _ = execute_query_rayon_parallel2_volcano(query, db); + total_time += start.elapsed().as_secs_f64(); + } + + let avg = total_time / (ITERATIONS as f64); + println!("Average time for {}: {:.6} seconds", name, avg); } - } } + +fn main() { + std::env::set_current_dir(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) + .expect("Failed to set project root as current directory"); + + let file_path = "../benchmark_dataset/watdiv.10M.nt"; + let workload = workload_queries(); + + match parse_large_ntriples_file(file_path, 
&workload) { + Ok(mut db) => { + println!("Successfully processed N-Triples file"); + run_all_queries(&mut db, &workload); + } + Err(e) => { + eprintln!("Error processing file '{}': {}", file_path, e); + println!( + "Make sure ../benchmark_dataset/watdiv.10M.nt exists." + ); + } + } +} \ No newline at end of file diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index c8ce8dc..e92c324 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -25,6 +25,7 @@ INDEX_TYPES=( "sop" "spo" "table" + "buckets" ) echo "==============================================" From 98e217bb1c5c9f06783695874a86217833fb1131 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Mon, 23 Mar 2026 01:28:41 +0100 Subject: [PATCH 12/23] Implement buckets --- kolibrie/src/sparql_database.rs | 9 +- shared/src/index_manager/buckets.rs | 231 ++++++++++++++++++++++++++++ shared/src/index_manager/mod.rs | 9 +- 3 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 shared/src/index_manager/buckets.rs diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index aa3994b..765e266 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -39,7 +39,7 @@ use url::Url; use crate::streamertail_optimizer::DatabaseStats; use shared::index_manager::{ IndexConfig, HexastoreIndex, SPOSingleIndex, POSSingleIndex, OSPSingleIndex, - PSOSingleIndex, OPSSingleIndex, SOPSingleIndex, SingleTableIndex, DynamicHexastoreIndex + PSOSingleIndex, OPSSingleIndex, SOPSingleIndex, SingleTableIndex, DynamicHexastoreIndex, BucketIndex }; use shared::terms::TriplePattern; use crate::parser::convert_triple_pattern; @@ -102,6 +102,7 @@ impl SparqlDatabase { // Pattern-dependent indexes start as hexastore; // 
`build_all_indexes` will swap them out. IndexConfig::DynamicHexastore { .. } => Box::new(HexastoreIndex::new()), + IndexConfig::Buckets { .. } => Box::new(HexastoreIndex::new()), } } @@ -151,6 +152,12 @@ impl SparqlDatabase { Box::new(DynamicHexastoreIndex::new(patterns, eval)) } + IndexConfig::Buckets { queries } => { + let patterns = self.resolve_query_patterns(queries); + print!("lmkjqdfkmjldfqmlkjqdfmljkqdfsmljkfqsdjlmk:{}", queries[0]); + Box::new(BucketIndex::new(patterns)) + } + // Future index types go here: // IndexConfig::YourNewIndex { some_param, queries } => { // let patterns = self.resolve_query_patterns(queries); diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs new file mode 100644 index 0000000..5129d89 --- /dev/null +++ b/shared/src/index_manager/buckets.rs @@ -0,0 +1,231 @@ +use serde::{Serialize, Deserialize}; +use std::collections::HashSet; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +/// A single bucket mapped to a specific access pattern. +/// It only stores triples that match its designated `TriplePattern`. +#[derive(Debug, Clone)] +pub struct Bucket { + pub pattern: TriplePattern, + pub data: HashSet, +} + +impl Bucket { + pub fn new(pattern: TriplePattern) -> Self { + Self { + pattern, + data: HashSet::new(), + } + } + + /// Checks if a given triple matches this bucket's specific pattern. + pub fn matches(&self, triple: &Triple) -> bool { + let (s, p, o) = &self.pattern; + + let s_match = match s { Constant(c) => triple.subject == *c, Variable(_) => true }; + let p_match = match p { Constant(c) => triple.predicate == *c, Variable(_) => true }; + let o_match = match o { Constant(c) => triple.object == *c, Variable(_) => true }; + + s_match && p_match && o_match + } +} + +/// The main indexer that manages multiple isolated buckets. 
+#[derive(Debug, Clone)] +pub struct BucketIndex { + pub buckets: Vec, +} + +impl BucketIndex { + pub fn new(patterns: Vec) -> Self { + let mut unique_patterns: Vec = Vec::new(); + + // Deduplicate patterns before creating buckets. + // We consider any Variable equivalent to any other Variable, + // and Constants equivalent if their inner values match. + for p in patterns { + let is_duplicate = unique_patterns.iter().any(|existing| Self::patterns_equivalent(existing, &p)); + if !is_duplicate { + unique_patterns.push(p); + } + } + + println!("--- BucketIndex Initialization ---"); + println!("Requested patterns: {}, Unique buckets created: {}", unique_patterns.len(), unique_patterns.len()); + if unique_patterns.is_empty() { + println!("WARNING: BucketIndex initialized with 0 patterns! No data will be stored."); + } + + let buckets = unique_patterns.into_iter().enumerate().map(|(i, pat)| { + println!(" Bucket [{}]: {:?}", i, pat); + Bucket::new(pat) + }).collect(); + + Self { buckets } + } + + /// Helper to check if two TriplePatterns are semantically equivalent. + /// This prevents creating separate buckets for (?s, ?p, ?o) and (?x, ?y, ?z) + /// if the Variable inner IDs differ. + fn patterns_equivalent(p1: &TriplePattern, p2: &TriplePattern) -> bool { + let match_term = |t1: &Term, t2: &Term| -> bool { + match (t1, t2) { + (Constant(c1), Constant(c2)) => c1 == c2, + (Variable(_), Variable(_)) => true, // All variables are treated as equivalent "unbound" slots + _ => false, + } + }; + + match_term(&p1.0, &p2.0) && match_term(&p1.1, &p2.1) && match_term(&p1.2, &p2.2) + } + + /// Determines if a bucket's pattern "covers" the query pattern. + /// A bucket covers a query if the bucket is EQUAL to or MORE GENERAL than the query. + /// If the bucket is more specific than the query, it is unsafe to use. 
+ fn bucket_covers_query(bucket_pat: &TriplePattern, q_s: Option, q_p: Option, q_o: Option) -> bool { + let (b_s, b_p, b_o) = bucket_pat; + + let s_safe = match b_s { Variable(_) => true, Constant(c) => q_s == Some(*c) }; + let p_safe = match b_p { Variable(_) => true, Constant(c) => q_p == Some(*c) }; + let o_safe = match b_o { Variable(_) => true, Constant(c) => q_o == Some(*c) }; + + s_safe && p_safe && o_safe + } + + /// Checks if a bucket pattern is an exact match for the query options. + fn is_exact_match(bucket_pat: &TriplePattern, q_s: Option, q_p: Option, q_o: Option) -> bool { + let (b_s, b_p, b_o) = bucket_pat; + + let s_match = match b_s { Constant(c) => q_s == Some(*c), Variable(_) => q_s.is_none() }; + let p_match = match b_p { Constant(c) => q_p == Some(*c), Variable(_) => q_p.is_none() }; + let o_match = match b_o { Constant(c) => q_o == Some(*c), Variable(_) => q_o.is_none() }; + + s_match && p_match && o_match + } +} + +impl TripleIndex for BucketIndex { + fn clone_empty(&self) -> Box { + // Re-create the index using the exact same bucket patterns, but empty. + let patterns = self.buckets.iter().map(|b| b.pattern.clone()).collect(); + Box::new(BucketIndex::new(patterns)) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + // Because triples might be stored twice or more across different buckets, + // we must deduplicate them to get the true logical count. 
+ let mut unique = HashSet::new(); + for bucket in &self.buckets { + for triple in &bucket.data { + unique.insert(triple.clone()); + } + } + unique.len() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { sp: false, so: false, po: false, ps: false, os: false, op: false } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let mut inserted_anywhere = false; + + for bucket in &mut self.buckets { + if bucket.matches(triple) { + if bucket.data.insert(triple.clone()) { + inserted_anywhere = true; + } + } + } + + inserted_anywhere + } + + fn delete(&mut self, triple: &Triple) -> bool { + let mut deleted_anywhere = false; + for bucket in &mut self.buckets { + if bucket.data.remove(triple) { + deleted_anywhere = true; + } + } + deleted_anywhere + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + self.clear(); + + println!("Building BucketIndex with {} triples across {} buckets...", triples.len(), self.buckets.len()); + + if self.buckets.is_empty() { + println!("WARNING: Cannot build from triples because 0 buckets exist!"); + return; + } + + let mut insert_count = 0; + for triple in triples { + if self.insert(triple) { + insert_count += 1; + } + } + + println!("Finished building. 
{}/{} triples matched at least one bucket.", insert_count, triples.len()); + self.optimize(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + // Step 1: Look for an EXACT match bucket first (no extra filtering needed) + if let Some(exact_bucket) = self.buckets.iter().find(|b| Self::is_exact_match(&b.pattern, s, p, o)) { + return exact_bucket.data.iter().cloned().collect(); + } + + // Step 2: Look for a "covering" bucket (more general than the query) + if let Some(covering_bucket) = self.buckets.iter().find(|b| Self::bucket_covers_query(&b.pattern, s, p, o)) { + return covering_bucket.data.iter() + .filter(|t| { + (s.is_none() || s == Some(t.subject)) && + (p.is_none() || p == Some(t.predicate)) && + (o.is_none() || o == Some(t.object)) + }) + .cloned() + .collect(); + } + + // Step 3: If no bucket covers this query, it is unsafe. + eprintln!("Warning: Query {:?} {:?} {:?} is too general for the existing buckets. Returning empty.", s, p, o); + Vec::new() + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + for bucket in &mut self.buckets { + bucket.data.clear(); + } + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + for bucket in &mut self.buckets { + bucket.data.shrink_to_fit(); + } + } +} \ No newline at end of file diff --git a/shared/src/index_manager/mod.rs 
b/shared/src/index_manager/mod.rs index 2d37240..3985db8 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -22,6 +22,7 @@ pub use sop_single::SOPSingleIndex; pub use spo_single::SPOSingleIndex; pub use single_table::SingleTableIndex; pub use dynamic_hexastore::DynamicHexastoreIndex; +pub use buckets::BucketIndex; pub mod hexastore; pub mod ops_single; pub mod osp_single; @@ -31,6 +32,7 @@ pub mod sop_single; pub mod spo_single; pub mod single_table; pub mod dynamic_hexastore; +pub mod buckets; #[derive(Debug, Clone)] pub enum IndexConfig { @@ -61,6 +63,11 @@ pub enum IndexConfig { queries: Vec, }, + /// Buckets + Buckets { + queries: Vec, + } + // ── Future index types go here ── // YourNewIndex { // some_setting: usize, @@ -75,7 +82,7 @@ impl Default for IndexConfig { } /// Describes which access patterns an index can serve efficiently. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct AccessPatternSupport { pub sp: bool, // subject+predicate -> objects pub so: bool, // subject+object -> predicates From 94436c4eea6e26f74c3dadcc2b723d312cad0273 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Tue, 31 Mar 2026 16:48:58 +0200 Subject: [PATCH 13/23] dynamic hexastore fix and buckets improvements --- .../n_triples_data/n_triple_10M.rs | 4 +- .../n_triple_10M_all_indexes.sh | 2 +- kolibrie/src/sparql_database.rs | 1 - shared/src/index_manager/buckets.rs | 359 +++++++++++++----- shared/src/index_manager/dynamic_hexastore.rs | 198 +--------- 5 files changed, 285 insertions(+), 279 deletions(-) diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index b6ae8a6..62baef4 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -522,7 +522,7 @@ fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { "ops" => IndexConfig::OPS, "sop" => 
IndexConfig::SOP, "table" => IndexConfig::SingleTable, - "dynamic" => IndexConfig::DynamicHexastore { + "dynamic_hexastore" => IndexConfig::DynamicHexastore { eval_interval: 1000, queries, }, @@ -644,4 +644,4 @@ fn main() { ); } } -} \ No newline at end of file +} diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index e92c324..328ee8e 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -16,6 +16,7 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" INDEX_TYPES=( + "buckets" "dynamic_hexastore" "hexastore" "ops" @@ -25,7 +26,6 @@ INDEX_TYPES=( "sop" "spo" "table" - "buckets" ) echo "==============================================" diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index 765e266..3316e1f 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -154,7 +154,6 @@ impl SparqlDatabase { IndexConfig::Buckets { queries } => { let patterns = self.resolve_query_patterns(queries); - print!("lmkjqdfkmjldfqmlkjqdfmljkqdfsmljkfqsdjlmk:{}", queries[0]); Box::new(BucketIndex::new(patterns)) } diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs index 5129d89..45351e4 100644 --- a/shared/src/index_manager/buckets.rs +++ b/shared/src/index_manager/buckets.rs @@ -1,38 +1,257 @@ use serde::{Serialize, Deserialize}; -use std::collections::HashSet; + +use std::collections::{HashSet, HashMap}; + use crate::terms::*; use crate::triple::Triple; use crate::index_manager::*; -/// A single bucket mapped to a specific access pattern. -/// It only stores triples that match its designated `TriplePattern`. 
+// ── helpers ────────────────────────────────────────────────────────────────── + +fn get_triple_field(triple: &Triple, pos: usize) -> u32 { + match pos { + 0 => triple.subject, + 1 => triple.predicate, + 2 => triple.object, + _ => panic!("invalid position {pos}"), + } +} + +fn is_one_constant_pattern(pattern: &TriplePattern) -> bool { + let (s, p, o) = pattern; + matches!( + (s, p, o), + (Constant(_), Variable(_), Variable(_)) + | (Variable(_), Constant(_), Variable(_)) + | (Variable(_), Variable(_), Constant(_)) + ) +} + +// ── TwoWayData ──────────────────────────────────────────────────────────────── + +/// Bidirectional index for buckets with exactly one bound constant. +/// +/// Given a pattern like `(?s, C, ?o)`: +/// - `pos_a = 0` (subject), `pos_b = 2` (object), `const_pos = 1`, `const_val = C` +/// - `forward`: subject → { objects … } +/// - `backward`: object → { subjects … } +/// +/// A query that binds `pos_a` (e.g. `bound_s, C, ?o`) is served by a single +/// `forward` lookup instead of iterating the whole bucket. 
+#[derive(Debug, Clone)] +pub struct TwoWayData { + pos_a: usize, + pos_b: usize, + const_pos: usize, + const_val: u32, + forward: HashMap>, // pos_a_val → { pos_b_val, … } + backward: HashMap>, // pos_b_val → { pos_a_val, … } +} + +impl TwoWayData { + fn from_pattern(pattern: &TriplePattern) -> Self { + let (s, p, o) = pattern; + let mut free = Vec::new(); + let mut const_pos = 0; + let mut const_val = 0u32; + + for (i, term) in [s, p, o].iter().enumerate() { + match term { + Variable(_) => free.push(i), + Constant(c) => { const_pos = i; const_val = *c; } + } + } + + assert_eq!(free.len(), 2, "TwoWayData requires exactly one constant"); + + Self { + pos_a: free[0], pos_b: free[1], + const_pos, const_val, + forward: HashMap::new(), backward: HashMap::new(), + } + } + + fn build_triple(&self, a: u32, b: u32) -> Triple { + let mut vals = [0u32; 3]; + vals[self.pos_a] = a; + vals[self.pos_b] = b; + vals[self.const_pos] = self.const_val; + Triple { subject: vals[0], predicate: vals[1], object: vals[2] } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let a = get_triple_field(triple, self.pos_a); + let b = get_triple_field(triple, self.pos_b); + let inserted = self.forward.entry(a).or_default().insert(b); + self.backward.entry(b).or_default().insert(a); + inserted + } + + fn remove(&mut self, triple: &Triple) -> bool { + let a = get_triple_field(triple, self.pos_a); + let b = get_triple_field(triple, self.pos_b); + + let removed = if let Some(set) = self.forward.get_mut(&a) { + let r = set.remove(&b); + if set.is_empty() { self.forward.remove(&a); } + r + } else { false }; + + if removed { + if let Some(set) = self.backward.get_mut(&b) { + set.remove(&a); + if set.is_empty() { self.backward.remove(&b); } + } + } + + removed + } + + /// Query using `q[0..=2]` = `[s, p, o]` as `Option`. + /// The constant position is already guaranteed to match by the time we get here. 
+ fn query(&self, q: [Option; 3]) -> Vec { + let qa = q[self.pos_a]; + let qb = q[self.pos_b]; + + match (qa, qb) { + // One free dimension bound → single hashmap lookup, O(output) + (Some(a), None) => { + self.forward.get(&a).map_or(Vec::new(), |bs| { + bs.iter().map(|&b| self.build_triple(a, b)).collect() + }) + } + (None, Some(b)) => { + self.backward.get(&b).map_or(Vec::new(), |as_| { + as_.iter().map(|&a| self.build_triple(a, b)).collect() + }) + } + // Both free dimensions bound → existence check + (Some(a), Some(b)) => { + if self.forward.get(&a).map_or(false, |bs| bs.contains(&b)) { + vec![self.build_triple(a, b)] + } else { + Vec::new() + } + } + // Nothing extra bound → dump everything + (None, None) => { + self.forward.iter() + .flat_map(|(&a, bs)| bs.iter().map(move |&b| (a, b))) + .map(|(a, b)| self.build_triple(a, b)) + .collect() + } + } + } + + fn triple_count(&self) -> usize { + self.forward.values().map(|s| s.len()).sum() + } + + fn clear(&mut self) { + self.forward.clear(); + self.backward.clear(); + } + + fn shrink_to_fit(&mut self) { + for s in self.forward.values_mut() { s.shrink_to_fit(); } + for s in self.backward.values_mut() { s.shrink_to_fit(); } + self.forward.shrink_to_fit(); + self.backward.shrink_to_fit(); + } +} + +// ── BucketData ──────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +pub enum BucketData { + /// Patterns with 0, 2, or 3 constants – a flat set is fine. + Simple(HashSet), + /// Pattern with exactly 1 constant – bidirectional maps for O(output) lookups. 
+ TwoWay(TwoWayData), +} + +// ── Bucket ──────────────────────────────────────────────────────────────────── + #[derive(Debug, Clone)] pub struct Bucket { pub pattern: TriplePattern, - pub data: HashSet, + pub data: BucketData, } impl Bucket { pub fn new(pattern: TriplePattern) -> Self { - Self { - pattern, - data: HashSet::new(), - } + let data = if is_one_constant_pattern(&pattern) { + BucketData::TwoWay(TwoWayData::from_pattern(&pattern)) + } else { + BucketData::Simple(HashSet::new()) + }; + Self { pattern, data } } - /// Checks if a given triple matches this bucket's specific pattern. pub fn matches(&self, triple: &Triple) -> bool { let (s, p, o) = &self.pattern; - - let s_match = match s { Constant(c) => triple.subject == *c, Variable(_) => true }; - let p_match = match p { Constant(c) => triple.predicate == *c, Variable(_) => true }; - let o_match = match o { Constant(c) => triple.object == *c, Variable(_) => true }; - - s_match && p_match && o_match + let s_ok = match s { Constant(c) => triple.subject == *c, Variable(_) => true }; + let p_ok = match p { Constant(c) => triple.predicate == *c, Variable(_) => true }; + let o_ok = match o { Constant(c) => triple.object == *c, Variable(_) => true }; + s_ok && p_ok && o_ok + } + + pub fn insert(&mut self, triple: &Triple) -> bool { + match &mut self.data { + BucketData::Simple(set) => set.insert(triple.clone()), + BucketData::TwoWay(tw) => tw.insert(triple), + } + } + + pub fn remove(&mut self, triple: &Triple) -> bool { + match &mut self.data { + BucketData::Simple(set) => set.remove(triple), + BucketData::TwoWay(tw) => tw.remove(triple), + } + } + + pub fn triple_count(&self) -> usize { + match &self.data { + BucketData::Simple(set) => set.len(), + BucketData::TwoWay(tw) => tw.triple_count(), + } + } + + pub fn clear(&mut self) { + match &mut self.data { + BucketData::Simple(set) => set.clear(), + BucketData::TwoWay(tw) => tw.clear(), + } + } + + pub fn shrink_to_fit(&mut self) { + match &mut self.data { + 
BucketData::Simple(set) => set.shrink_to_fit(), + BucketData::TwoWay(tw) => tw.shrink_to_fit(), + } + } + + /// Return triples that match the given optional bindings. + /// Callers must ensure the bucket covers the query (i.e. `bucket_covers_query` passed). + pub fn query(&self, s: Option, p: Option, o: Option) -> Vec { + match &self.data { + BucketData::Simple(set) => { + set.iter() + .filter(|t| { + (s.is_none() || s == Some(t.subject)) && + (p.is_none() || p == Some(t.predicate)) && + (o.is_none() || o == Some(t.object)) + }) + .cloned() + .collect() + } + BucketData::TwoWay(tw) => tw.query([s, p, o]), + } } } -/// The main indexer that manages multiple isolated buckets. +// ── BucketIndex ─────────────────────────────────────────────────────────────── + #[derive(Debug, Clone)] pub struct BucketIndex { pub buckets: Vec, @@ -42,14 +261,9 @@ impl BucketIndex { pub fn new(patterns: Vec) -> Self { let mut unique_patterns: Vec = Vec::new(); - // Deduplicate patterns before creating buckets. - // We consider any Variable equivalent to any other Variable, - // and Constants equivalent if their inner values match. for p in patterns { - let is_duplicate = unique_patterns.iter().any(|existing| Self::patterns_equivalent(existing, &p)); - if !is_duplicate { - unique_patterns.push(p); - } + let is_dup = unique_patterns.iter().any(|e| Self::patterns_equivalent(e, &p)); + if !is_dup { unique_patterns.push(p); } } println!("--- BucketIndex Initialization ---"); @@ -66,49 +280,34 @@ impl BucketIndex { Self { buckets } } - /// Helper to check if two TriplePatterns are semantically equivalent. - /// This prevents creating separate buckets for (?s, ?p, ?o) and (?x, ?y, ?z) - /// if the Variable inner IDs differ. 
fn patterns_equivalent(p1: &TriplePattern, p2: &TriplePattern) -> bool { - let match_term = |t1: &Term, t2: &Term| -> bool { - match (t1, t2) { - (Constant(c1), Constant(c2)) => c1 == c2, - (Variable(_), Variable(_)) => true, // All variables are treated as equivalent "unbound" slots - _ => false, - } + let match_term = |t1: &Term, t2: &Term| match (t1, t2) { + (Constant(c1), Constant(c2)) => c1 == c2, + (Variable(_), Variable(_)) => true, + _ => false, }; - match_term(&p1.0, &p2.0) && match_term(&p1.1, &p2.1) && match_term(&p1.2, &p2.2) } - /// Determines if a bucket's pattern "covers" the query pattern. - /// A bucket covers a query if the bucket is EQUAL to or MORE GENERAL than the query. - /// If the bucket is more specific than the query, it is unsafe to use. fn bucket_covers_query(bucket_pat: &TriplePattern, q_s: Option, q_p: Option, q_o: Option) -> bool { let (b_s, b_p, b_o) = bucket_pat; - let s_safe = match b_s { Variable(_) => true, Constant(c) => q_s == Some(*c) }; let p_safe = match b_p { Variable(_) => true, Constant(c) => q_p == Some(*c) }; let o_safe = match b_o { Variable(_) => true, Constant(c) => q_o == Some(*c) }; - s_safe && p_safe && o_safe } - /// Checks if a bucket pattern is an exact match for the query options. 
fn is_exact_match(bucket_pat: &TriplePattern, q_s: Option, q_p: Option, q_o: Option) -> bool { let (b_s, b_p, b_o) = bucket_pat; - - let s_match = match b_s { Constant(c) => q_s == Some(*c), Variable(_) => q_s.is_none() }; - let p_match = match b_p { Constant(c) => q_p == Some(*c), Variable(_) => q_p.is_none() }; - let o_match = match b_o { Constant(c) => q_o == Some(*c), Variable(_) => q_o.is_none() }; - - s_match && p_match && o_match + let s_ok = match b_s { Constant(c) => q_s == Some(*c), Variable(_) => q_s.is_none() }; + let p_ok = match b_p { Constant(c) => q_p == Some(*c), Variable(_) => q_p.is_none() }; + let o_ok = match b_o { Constant(c) => q_o == Some(*c), Variable(_) => q_o.is_none() }; + s_ok && p_ok && o_ok } } impl TripleIndex for BucketIndex { fn clone_empty(&self) -> Box { - // Re-create the index using the exact same bucket patterns, but empty. let patterns = self.buckets.iter().map(|b| b.pattern.clone()).collect(); Box::new(BucketIndex::new(patterns)) } @@ -118,13 +317,13 @@ impl TripleIndex for BucketIndex { } fn triple_count(&self) -> usize { - // Because triples might be stored twice or more across different buckets, - // we must deduplicate them to get the true logical count. - let mut unique = HashSet::new(); + // Buckets may overlap, so deduplicate. + // For TwoWay buckets we reconstruct triples on the fly; this is the one + // place where the extra memory of TwoWay costs a bit more to count. 
+ let mut unique: HashSet = HashSet::new(); for bucket in &self.buckets { - for triple in &bucket.data { - unique.insert(triple.clone()); - } + // query(None,None,None) works for both Simple and TwoWay + unique.extend(bucket.query(None, None, None)); } unique.len() } @@ -135,22 +334,18 @@ impl TripleIndex for BucketIndex { fn insert(&mut self, triple: &Triple) -> bool { let mut inserted_anywhere = false; - for bucket in &mut self.buckets { - if bucket.matches(triple) { - if bucket.data.insert(triple.clone()) { - inserted_anywhere = true; - } + if bucket.matches(triple) && bucket.insert(triple) { + inserted_anywhere = true; } } - inserted_anywhere } fn delete(&mut self, triple: &Triple) -> bool { let mut deleted_anywhere = false; for bucket in &mut self.buckets { - if bucket.data.remove(triple) { + if bucket.remove(triple) { deleted_anywhere = true; } } @@ -159,9 +354,9 @@ impl TripleIndex for BucketIndex { fn build_from_triples(&mut self, triples: &[Triple]) { self.clear(); - + println!("Building BucketIndex with {} triples across {} buckets...", triples.len(), self.buckets.len()); - + if self.buckets.is_empty() { println!("WARNING: Cannot build from triples because 0 buckets exist!"); return; @@ -169,34 +364,25 @@ impl TripleIndex for BucketIndex { let mut insert_count = 0; for triple in triples { - if self.insert(triple) { - insert_count += 1; - } + if self.insert(triple) { insert_count += 1; } } - + println!("Finished building. 
{}/{} triples matched at least one bucket.", insert_count, triples.len()); self.optimize(); } fn query(&self, s: Option, p: Option, o: Option) -> Vec { - // Step 1: Look for an EXACT match bucket first (no extra filtering needed) - if let Some(exact_bucket) = self.buckets.iter().find(|b| Self::is_exact_match(&b.pattern, s, p, o)) { - return exact_bucket.data.iter().cloned().collect(); - } - - // Step 2: Look for a "covering" bucket (more general than the query) - if let Some(covering_bucket) = self.buckets.iter().find(|b| Self::bucket_covers_query(&b.pattern, s, p, o)) { - return covering_bucket.data.iter() - .filter(|t| { - (s.is_none() || s == Some(t.subject)) && - (p.is_none() || p == Some(t.predicate)) && - (o.is_none() || o == Some(t.object)) - }) - .cloned() - .collect(); + // Exact match: the bucket pattern mirrors the query exactly, no extra filtering. + if let Some(b) = self.buckets.iter().find(|b| Self::is_exact_match(&b.pattern, s, p, o)) { + return b.query(s, p, o); + } + + // Covering match: bucket is more general; Bucket::query() handles the filtering, + // and TwoWay buckets do it in O(output) via a hashmap lookup. + if let Some(b) = self.buckets.iter().find(|b| Self::bucket_covers_query(&b.pattern, s, p, o)) { + return b.query(s, p, o); } - // Step 3: If no bucket covers this query, it is unsafe. eprintln!("Warning: Query {:?} {:?} {:?} is too general for the existing buckets. 
Returning empty.", s, p, o); Vec::new() } @@ -206,14 +392,11 @@ impl TripleIndex for BucketIndex { let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; - self.query(sub, pre, obj) } fn clear(&mut self) { - for bucket in &mut self.buckets { - bucket.data.clear(); - } + for bucket in &mut self.buckets { bucket.clear(); } } fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } @@ -224,8 +407,6 @@ impl TripleIndex for BucketIndex { fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } fn optimize(&mut self) { - for bucket in &mut self.buckets { - bucket.data.shrink_to_fit(); - } + for bucket in &mut self.buckets { bucket.shrink_to_fit(); } } -} \ No newline at end of file +} diff --git a/shared/src/index_manager/dynamic_hexastore.rs b/shared/src/index_manager/dynamic_hexastore.rs index b2bad25..68f28f4 100644 --- a/shared/src/index_manager/dynamic_hexastore.rs +++ b/shared/src/index_manager/dynamic_hexastore.rs @@ -174,6 +174,7 @@ pub struct CardinalitySnapshot { pub num_subjects: f64, pub num_predicates: f64, pub num_objects: f64, + pub num_sp_pairs: f64, } impl CardinalitySnapshot { @@ -183,18 +184,20 @@ impl CardinalitySnapshot { unique_subjects: usize, unique_predicates: usize, unique_objects: usize, + unique_sp_pairs: usize, ) -> Self { // Use at least 1.0 to avoid division-by-zero in cost formulas Self { num_subjects: (unique_subjects as f64).max(1.0), num_predicates: (unique_predicates as f64).max(1.0), num_objects: (unique_objects as f64).max(1.0), + num_sp_pairs: (unique_sp_pairs as f64).max(1.0), } } /// A default snapshot when we have no data yet. 
fn unknown() -> Self { - Self { num_subjects: 1.0, num_predicates: 1.0, num_objects: 1.0 } + Self { num_subjects: 1.0, num_predicates: 1.0, num_objects: 1.0, num_sp_pairs: 1.0 } } } @@ -686,6 +689,7 @@ impl DynamicHexastoreIndex { self.inserts_since_eval = 0; self.reevaluate(); } + println!("subj {} pred {} obj {}", self.latest_card.num_subjects, self.latest_card.num_predicates, self.latest_card.num_objects); } } @@ -744,7 +748,6 @@ impl TripleIndex for DynamicHexastoreIndex { any_new = true; } } - if any_new { self.maybe_reevaluate(); } @@ -777,22 +780,29 @@ impl TripleIndex for DynamicHexastoreIndex { pool.insert(triple); } } + for pool in &mut self.pools { + println!("pool {:?}", pool.desired_type); + } // After bulk load, gather rough cardinalities and reevaluate let mut subjects = HashSet::new(); let mut predicates = HashSet::new(); let mut objects = HashSet::new(); + let mut sp_pairs = HashSet::new(); for t in triples { subjects.insert(t.subject); predicates.insert(t.predicate); objects.insert(t.object); + sp_pairs.insert((t.subject, t.predicate)); } self.latest_card = CardinalitySnapshot::from_stats( triples.len() as u64, subjects.len(), predicates.len(), objects.len(), + sp_pairs.len(), ); self.reevaluate(); + println!("subj {} pred {} obj {} sp {}", self.latest_card.num_subjects, self.latest_card.num_predicates, self.latest_card.num_objects, self.latest_card.num_sp_pairs); } // ── Query ─────────────────────────────────────────────────────────── @@ -900,187 +910,3 @@ impl TripleIndex for DynamicHexastoreIndex { } } } - -// ─── Tests ────────────────────────────────────────────────────────────────── - -#[cfg(test)] -mod tests { - use super::*; - use crate::triple::Triple; - - fn make_triple(s: u32, p: u32, o: u32) -> Triple { - Triple { subject: s, predicate: p, object: o } - } - - #[test] - fn test_insert_and_query_basic() { - // Access pattern: (?s, p=1, ?o) — only predicate bound - let patterns = vec![ - (Term::Variable("s".into()), Term::Constant(1), 
Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - - assert!(idx.insert(&make_triple(10, 1, 100))); - assert!(idx.insert(&make_triple(20, 1, 200))); - assert!(idx.insert(&make_triple(30, 2, 300))); - - let result = idx.query(None, Some(1), None); - assert_eq!(result.len(), 2); - - let result_all = idx.query(None, None, None); - assert_eq!(result_all.len(), 3); - } - - #[test] - fn test_delete_removes_from_all_pools() { - let patterns = vec![ - (Term::Constant(1), Term::Variable("p".into()), Term::Variable("o".into())), - (Term::Variable("s".into()), Term::Constant(2), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - - idx.insert(&make_triple(1, 2, 3)); - idx.insert(&make_triple(1, 2, 4)); - assert_eq!(idx.triple_count(), 2); - - assert!(idx.delete(&make_triple(1, 2, 3))); - assert_eq!(idx.triple_count(), 1); - - // Deleting non-existent triple returns false - assert!(!idx.delete(&make_triple(99, 99, 99))); - } - - #[test] - fn test_duplicate_insert_returns_false() { - let patterns = vec![ - (Term::Variable("s".into()), Term::Variable("p".into()), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - - assert!(idx.insert(&make_triple(1, 2, 3))); - assert!(!idx.insert(&make_triple(1, 2, 3))); - assert_eq!(idx.triple_count(), 1); - } - - #[test] - fn test_build_from_triples() { - let patterns = vec![ - (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), - (Term::Constant(10), Term::Variable("p".into()), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - - let triples: Vec = (0..100) - .map(|i| make_triple(i % 10, i % 5, i)) - .collect(); - idx.build_from_triples(&triples); - - assert_eq!(idx.triple_count(), 100); - - // Query specific predicate - let p1 = idx.query(None, Some(1), None); - assert_eq!(p1.len(), 20); // i % 5 == 1 for i=1,6,11,...,96 → 20 
triples - } - - #[test] - fn test_clear() { - let patterns = vec![ - (Term::Variable("s".into()), Term::Variable("p".into()), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - idx.insert(&make_triple(1, 2, 3)); - idx.insert(&make_triple(4, 5, 6)); - assert_eq!(idx.triple_count(), 2); - - idx.clear(); - assert_eq!(idx.triple_count(), 0); - } - - #[test] - fn test_get_matching_triples() { - let patterns = vec![ - (Term::Constant(1), Term::Constant(2), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - idx.insert(&make_triple(1, 2, 10)); - idx.insert(&make_triple(1, 2, 20)); - idx.insert(&make_triple(1, 3, 30)); - - let pat = (Term::Constant(1), Term::Constant(2), Term::Variable("o".into())); - let result = idx.get_matching_triples(&pat); - assert_eq!(result.len(), 2); - } - - #[test] - fn test_reevaluate_does_not_lose_data() { - let patterns = vec![ - (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::new(patterns, 5); - - // Insert enough to trigger re-evaluation - for i in 0..20 { - idx.insert(&make_triple(i, 1, i * 10)); - } - // Update cardinalities and force reevaluate - idx.update_cardinalities(CardinalitySnapshot { - num_subjects: 20.0, - num_predicates: 1.0, - num_objects: 20.0, - }); - idx.reevaluate(); - - // Data should still be there - assert_eq!(idx.triple_count(), 20); - let result = idx.query(None, Some(1), None); - assert_eq!(result.len(), 20); - } - - #[test] - fn test_supported_access_patterns() { - // Pattern needs SP scan �� should report sp=true - let patterns = vec![ - (Term::Constant(1), Term::Constant(2), Term::Variable("o".into())), - ]; - let idx = DynamicHexastoreIndex::with_patterns(patterns); - let support = idx.supported_access_patterns(); - // At least one of SP or PS should be supported - assert!(support.sp || support.ps); - } - - #[test] - fn test_cost_functions() { - 
let card = CardinalitySnapshot { - num_subjects: 100.0, - num_predicates: 10.0, - num_objects: 50.0, - }; - - // Querying SPO for bound s,p should be cheap (2 lookups) - let cost_sp = query_cost(IndexType::SPO, BoundPattern { s: true, p: true, o: false }, &card); - assert_eq!(cost_sp, 2.0); - - // Querying SPO for bound o only should be expensive (full scan) - let cost_o = query_cost(IndexType::SPO, BoundPattern { s: false, p: false, o: true }, &card); - assert!(cost_o > 10.0); - - // OPS should be cheap for bound o - let cost_o_ops = query_cost(IndexType::OPS, BoundPattern { s: false, p: false, o: true }, &card); - assert!(cost_o_ops < cost_o); - } - - #[test] - fn test_clone_empty_and_clone_box() { - let patterns = vec![ - (Term::Variable("s".into()), Term::Constant(1), Term::Variable("o".into())), - ]; - let mut idx = DynamicHexastoreIndex::with_patterns(patterns); - idx.insert(&make_triple(1, 1, 10)); - - let empty = idx.clone_empty(); - assert_eq!(empty.triple_count(), 0); - - let cloned = idx.clone_box(); - assert_eq!(cloned.triple_count(), 1); - } -} From 9c5c302499765aa19a25d8d39e6cd78f0e77145a Mon Sep 17 00:00:00 2001 From: Mirovh Date: Tue, 31 Mar 2026 16:57:15 +0200 Subject: [PATCH 14/23] fix gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2eb1831..5ed561b 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ python/.venv/ # Some other directories benchmark_dataset/ kolibrie/examples/sparql_syntax/n_triples_data/benchmark_results/ +!kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh # IntelliJ .idea/ From 16547bcbc070d7b18bec92207b194e8aaa1e8330 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Tue, 7 Apr 2026 07:27:47 +0200 Subject: [PATCH 15/23] Physical plan buckets --- .../n_triple_10M_all_indexes.sh | 4 +- .../sparql_syntax/n_triples_data/temp.txt | 451 ++++++++++++++++++ 2 files changed, 453 insertions(+), 2 deletions(-) create 
mode 100644 kolibrie/examples/sparql_syntax/n_triples_data/temp.txt diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index 328ee8e..76b6894 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -17,12 +17,12 @@ RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" INDEX_TYPES=( "buckets" - "dynamic_hexastore" + "pso" "hexastore" + "dynamic_hexastore" "ops" "osp" "pos" - "pso" "sop" "spo" "table" diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/temp.txt b/kolibrie/examples/sparql_syntax/n_triples_data/temp.txt new file mode 100644 index 0000000..dbe5c39 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/n_triples_data/temp.txt @@ -0,0 +1,451 @@ +============================================== + Kolibrie Index Benchmark Runner +============================================== +Output directory: ./benchmark_results/20260407_065605 +Index types: buckets pso hexastore dynamic_hexastore ops osp pos sop spo table +============================================== + +[BUILD] Compiling in release mode... +warning: unused variable: `i` + --> shared\src\index_manager\buckets.rs:219:14 + | +219 | for (i, b) in self.buckets.iter().enumerate() { + | ^ help: if this is intentional, prefix it with an underscore: `_i` + | + = note: `#[warn(unused_variables)]` (part of `#[warn(unused)]`) on by default + +warning: method `insert_optimized` is never used + --> shared\src\index_manager\single_table.rs:141:6 + | +129 | impl SingleTableIndex { + | --------------------- method in this implementation +... 
+141 | fn insert_optimized(&mut self, triple: &Triple) -> bool { + | ^^^^^^^^^^^^^^^^ + | + = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default + +warning: method `merge_scan` is never used + --> shared\src\index_manager\dynamic_hexastore.rs:409:8 + | +316 | impl IndexPool { + | -------------- method in this implementation +... +409 | fn merge_scan(&self, scan_fn: F) -> Option> + | ^^^^^^^^^^ + +warning: `shared` (lib) generated 3 warnings (run `cargo fix --lib -p shared` to apply 1 suggestion) +warning: `shared` (lib) generated 3 warnings (3 duplicates) + Compiling kolibrie v0.1.1 (C:\Users\mirov\Documents\1.Bestanden\Kolibrie\kolibrie) +warning: unused import: `shared::index_manager::TripleIndex` + --> datalog\src\reasoning_experimental.rs:13:5 + | +13 | use shared::index_manager::TripleIndex; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + +warning: `datalog` (lib) generated 1 warning +warning: unused import: `shared::index_manager::TripleIndex` + --> kolibrie\src\storage_manager.rs:16:5 + | +16 | use shared::index_manager::TripleIndex; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + +warning: `kolibrie` (lib) generated 1 warning + Finished `release` profile [optimized + debuginfo] target(s) in 24.04s +[BUILD] Done. 
+ +============================================== +[RUN] INDEX_TYPE=buckets + Output: ./benchmark_results/20260407_065605/buckets.txt +============================================== +warning: unused variable: `i` + --> shared\src\index_manager\buckets.rs:219:14 + | +219 | for (i, b) in self.buckets.iter().enumerate() { + | ^ help: if this is intentional, prefix it with an underscore: `_i` + | + = note: `#[warn(unused_variables)]` (part of `#[warn(unused)]`) on by default + +warning: method `insert_optimized` is never used + --> shared\src\index_manager\single_table.rs:141:6 + | +129 | impl SingleTableIndex { + | --------------------- method in this implementation +... +141 | fn insert_optimized(&mut self, triple: &Triple) -> bool { + | ^^^^^^^^^^^^^^^^ + | + = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default + +warning: method `merge_scan` is never used + --> shared\src\index_manager\dynamic_hexastore.rs:409:8 + | +316 | impl IndexPool { + | -------------- method in this implementation +... 
+409 | fn merge_scan(&self, scan_fn: F) -> Option> + | ^^^^^^^^^^ + +warning: `shared` (lib) generated 3 warnings (run `cargo fix --lib -p shared` to apply 1 suggestion) +warning: `shared` (lib) generated 3 warnings (3 duplicates) +warning: unused import: `shared::index_manager::TripleIndex` + --> datalog\src\reasoning_experimental.rs:13:5 + | +13 | use shared::index_manager::TripleIndex; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + + Compiling kolibrie v0.1.1 (C:\Users\mirov\Documents\1.Bestanden\Kolibrie\kolibrie) +warning: `datalog` (lib) generated 1 warning +warning: unused import: `shared::index_manager::TripleIndex` + --> kolibrie\src\storage_manager.rs:16:5 + | +16 | use shared::index_manager::TripleIndex; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + +warning: `kolibrie` (lib) generated 1 warning + Finished `release` profile [optimized + debuginfo] target(s) in 23.97s + Running `C:\Users\mirov\Documents\1.Bestanden\Kolibrie\target\release\examples\n_triple_10M.exe` +INDEX_TYPE = buckets +Starting to parse N-Triples file: ../benchmark_dataset/watdiv.10M.nt +Processed 100000 triples +Processed 200000 triples +Processed 300000 triples +Processed 400000 triples +Processed 500000 triples +Processed 600000 triples +Processed 700000 triples +Processed 800000 triples +Processed 900000 triples +Processed 1000000 triples +Processed 1100000 triples +Processed 1200000 triples +Processed 1300000 triples +Processed 1400000 triples +Processed 1500000 triples +Processed 1600000 triples +Processed 1700000 triples +Processed 1800000 triples +Processed 1900000 triples +Processed 2000000 triples +Processed 2100000 triples +Processed 2200000 triples +Processed 2300000 triples +Processed 2400000 triples +Processed 2500000 triples +Processed 2600000 triples +Processed 2700000 triples +Processed 2800000 triples +Processed 
2900000 triples +Processed 3000000 triples +Processed 3100000 triples +Processed 3200000 triples +Processed 3300000 triples +Processed 3400000 triples +Processed 3500000 triples +Processed 3600000 triples +Processed 3700000 triples +Processed 3800000 triples +Processed 3900000 triples +Processed 4000000 triples +Processed 4100000 triples +Processed 4200000 triples +Processed 4300000 triples +Processed 4400000 triples +Processed 4500000 triples +Processed 4600000 triples +Processed 4700000 triples +Processed 4800000 triples +Processed 4900000 triples +Processed 5000000 triples +Processed 5100000 triples +Processed 5200000 triples +Processed 5300000 triples +Processed 5400000 triples +Processed 5500000 triples +Processed 5600000 triples +Processed 5700000 triples +Processed 5800000 triples +Processed 5900000 triples +Processed 6000000 triples +Processed 6100000 triples +Processed 6200000 triples +Processed 6300000 triples +Processed 6400000 triples +Processed 6500000 triples +Processed 6600000 triples +Processed 6700000 triples +Processed 6800000 triples +Processed 6900000 triples +Processed 7000000 triples +Processed 7100000 triples +Processed 7200000 triples +Processed 7300000 triples +Processed 7400000 triples +Processed 7500000 triples +Processed 7600000 triples +Processed 7700000 triples +Processed 7800000 triples +Processed 7900000 triples +Processed 8000000 triples +Processed 8100000 triples +Processed 8200000 triples +Processed 8300000 triples +Processed 8400000 triples +Processed 8500000 triples +Processed 8600000 triples +Processed 8700000 triples +Processed 8800000 triples +Processed 8900000 triples +Processed 9000000 triples +Processed 9100000 triples +Processed 9200000 triples +Processed 9300000 triples +Processed 9400000 triples +Processed 9500000 triples +Processed 9600000 triples +Processed 9700000 triples +Processed 9800000 triples +Processed 9900000 triples +Processed 10000000 triples +Processed 10100000 triples +Processed 10200000 triples 
+Processed 10300000 triples +Processed 10400000 triples +Processed 10500000 triples +Processed 10600000 triples +Processed 10700000 triples +Processed 10800000 triples +Processed 10900000 triples +Finished parsing 10916457 triples in 14.98 seconds +Building indexes... +[Bucket Debug] --- BucketIndex Initialization --- +[Bucket Debug] Requested planned patterns: 103 +[Bucket Debug] Bucket [0]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v1")), Bound: s=false, p=true, o=false +[Bucket Debug] Bucket [1]: Pattern: (Variable("?v0"), Constant(205904), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [2]: Pattern: (Variable("?v0"), Constant(205881), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [3]: Pattern: (Variable("?v0"), Constant(205988), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [4]: Pattern: (Variable("?v4"), Constant(601766), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [5]: Pattern: (Variable("?v4"), Constant(601764), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [6]: Pattern: (Variable("?v7"), Constant(206401), Variable("?v6")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [7]: Pattern: (Variable("?v7"), Constant(206079), Variable("?v8")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [8]: Pattern: (Variable("?v2"), Constant(287), Constant(22)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [9]: Pattern: (Variable("?v0"), Constant(593640), Variable("?v2")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [10]: Pattern: (Variable("?v2"), Constant(266), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [11]: Pattern: (Variable("?v4"), Constant(779978), Variable("?v5")), Bound: s=false, p=true, o=false +[Bucket Debug] Bucket [12]: Pattern: (Variable("?v4"), Constant(205883), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [13]: Pattern: 
(Variable("?v4"), Constant(780739), Variable("?v7")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [14]: Pattern: (Variable("?v7"), Constant(443313), Variable("?v3")), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [15]: Pattern: (Variable("?v3"), Constant(205988), Variable("?v8")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [16]: Pattern: (Variable("?v0"), Constant(593711), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [17]: Pattern: (Variable("?v8"), Constant(601879), Variable("?v9")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [18]: Pattern: (Variable("?v0"), Constant(779975), Variable("?v1")), Bound: s=false, p=true, o=false +[Bucket Debug] Bucket [19]: Pattern: (Variable("?v0"), Constant(779930), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [20]: Pattern: (Variable("?v0"), Constant(779874), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [21]: Pattern: (Variable("?v0"), Constant(779919), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [22]: Pattern: (Variable("?v0"), Constant(779869), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [23]: Pattern: (Variable("?v0"), Constant(779922), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [24]: Pattern: (Variable("?v3"), Constant(205876), Constant(205978)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [25]: Pattern: (Variable("?v3"), Constant(208830), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [26]: Pattern: (Variable("?v3"), Constant(205954), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [27]: Pattern: (Variable("?v3"), Constant(205867), Variable("?v0")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [28]: Pattern: (Variable("?v0"), Constant(205887), Constant(205986)), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [29]: Pattern: (Variable("?v0"), 
Constant(205876), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [30]: Pattern: (Variable("?v0"), Constant(205867), Constant(209611)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [31]: Pattern: (Variable("?v0"), Constant(205883), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [32]: Pattern: (Variable("?v0"), Constant(205872), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [33]: Pattern: (Variable("?v0"), Constant(1052657), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [34]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [35]: Pattern: (Variable("?v0"), Constant(205874), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [36]: Pattern: (Variable("?v1"), Constant(1045092), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [37]: Pattern: (Variable("?v1"), Constant(1045090), Variable("?v7")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [38]: Pattern: (Variable("?v0"), Constant(205867), Constant(254774)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [39]: Pattern: (Variable("?v0"), Constant(205881), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [40]: Pattern: (Variable("?v0"), Constant(205901), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [41]: Pattern: (Variable("?v5"), Constant(443313), Variable("?v0")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [42]: Pattern: (Variable("?v4"), Constant(780739), Variable("?v5")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [43]: Pattern: (Variable("?v5"), Constant(443311), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [44]: Pattern: (Variable("?v0"), Constant(205887), Constant(205942)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [45]: Pattern: (Variable("?v0"), Constant(205883), 
Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [46]: Pattern: (Variable("?v2"), Constant(266), Variable("?v0")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [47]: Pattern: (Variable("?v0"), Constant(205874), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [48]: Pattern: (Variable("?v0"), Constant(205901), Variable("?v8")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [49]: Pattern: (Variable("?v7"), Constant(779975), Variable("?v0")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [50]: Pattern: (Variable("?v1"), Constant(1045092), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [51]: Pattern: (Variable("?v1"), Constant(1045090), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [52]: Pattern: (Variable("?v1"), Constant(206079), Constant(206080)), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [53]: Pattern: (Constant(595878), Constant(593640), Variable("?v0")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [54]: Pattern: (Variable("?v0"), Constant(266), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [55]: Pattern: (Variable("?v0"), Constant(268), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [56]: Pattern: (Variable("?v0"), Constant(278), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [57]: Pattern: (Variable("?v1"), Constant(205872), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [58]: Pattern: (Variable("?v1"), Constant(1052657), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [59]: Pattern: (Variable("?v0"), Constant(779817), Constant(205907)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [60]: Pattern: (Variable("?v0"), Constant(779975), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [61]: Pattern: (Variable("?v2"), Constant(206061), Variable("?v3")), Bound: s=true, 
p=true, o=false +[Bucket Debug] Bucket [62]: Pattern: (Constant(0), Constant(1), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [63]: Pattern: (Variable("?v2"), Constant(779875), Variable("?v1")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [64]: Pattern: (Variable("?v2"), Constant(779975), Constant(63618)), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [65]: Pattern: (Variable("?v0"), Constant(779817), Constant(205907)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [66]: Pattern: (Variable("?v0"), Constant(779975), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [67]: Pattern: (Variable("?v0"), Constant(205887), Constant(205888)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [68]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [69]: Pattern: (Variable("?v0"), Constant(779978), Variable("?v1")), Bound: s=false, p=true, o=false +[Bucket Debug] Bucket [70]: Pattern: (Variable("?v0"), Constant(779875), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [71]: Pattern: (Constant(0), Constant(1), Variable("?v3")), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [72]: Pattern: (Constant(593635), Constant(593640), Variable("?v0")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [73]: Pattern: (Variable("?v0"), Constant(266), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [74]: Pattern: (Variable("?v0"), Constant(268), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [75]: Pattern: (Variable("?v0"), Constant(270), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [76]: Pattern: (Variable("?v0"), Constant(292), Variable("?v5")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [77]: Pattern: (Variable("?v0"), Constant(278), Variable("?v6")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [78]: Pattern: 
(Variable("?v0"), Constant(272), Variable("?v7")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [79]: Pattern: (Variable("?v0"), Constant(287), Variable("?v8")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [80]: Pattern: (Variable("?v0"), Constant(296), Variable("?v9")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [81]: Pattern: (Variable("?v0"), Constant(779875), Constant(2)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [82]: Pattern: (Variable("?v0"), Constant(1052658), Constant(779980)), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [83]: Pattern: (Variable("?v0"), Constant(779874), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [84]: Pattern: (Variable("?v0"), Constant(779869), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [85]: Pattern: (Variable("?v0"), Constant(1052658), Constant(206168)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [86]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [87]: Pattern: (Variable("?v0"), Constant(205867), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [88]: Pattern: (Variable("?v0"), Constant(206163), Variable("?v4")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [89]: Pattern: (Variable("?v0"), Constant(779919), Constant(779920)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [90]: Pattern: (Variable("?v0"), Constant(779875), Constant(11)), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [91]: Pattern: (Variable("?v0"), Constant(779921), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [92]: Pattern: (Variable("?v3"), Constant(205965), Variable("?v0")), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [93]: Pattern: (Variable("?v0"), Constant(1052658), Constant(205978)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [94]: Pattern: (Variable("?v0"), Constant(206079), 
Constant(206080)), Bound: s=true, p=true, o=true +[Bucket Debug] Bucket [95]: Pattern: (Variable("?v0"), Constant(205874), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [96]: Pattern: (Variable("?v0"), Constant(205954), Variable("?v3")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [97]: Pattern: (Variable("?v0"), Constant(205867), Constant(205868)), Bound: s=false, p=true, o=true +[Bucket Debug] Bucket [98]: Pattern: (Variable("?v0"), Constant(206582), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [99]: Pattern: (Variable("?v0"), Constant(1052658), Variable("?v2")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [100]: Pattern: (Constant(608516), Constant(779975), Variable("?v0")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [101]: Pattern: (Variable("?v0"), Constant(1052658), Variable("?v1")), Bound: s=true, p=true, o=false +[Bucket Debug] Bucket [102]: Pattern: (Variable("?v0"), Constant(205904), Variable("?v2")), Bound: s=true, p=true, o=false +Indexes built in 4.25 seconds +Successfully processed N-Triples file +============================================== +Running query C1 (3 iterations)... +Average time for C1: 0.004475 seconds +============================================== +Running query C2 (3 iterations)... +Average time for C2: 0.552848 seconds +============================================== +Running query C3 (3 iterations)... +Average time for C3: 0.847724 seconds +============================================== +Running query F1 (3 iterations)... +Average time for F1: 0.001397 seconds +============================================== +Running query F2 (3 iterations)... +Average time for F2: 0.000396 seconds +============================================== +Running query F3 (3 iterations)... +Average time for F3: 0.000446 seconds +============================================== +Running query F4 (3 iterations)... 
+Average time for F4: 0.004124 seconds +============================================== +Running query F5 (3 iterations)... +Average time for F5: 0.000712 seconds +============================================== +Running query L1 (3 iterations)... +Average time for L1: 0.000435 seconds +============================================== +Running query L2 (3 iterations)... +Average time for L2: 0.001124 seconds +============================================== +Running query L3 (3 iterations)... +Average time for L3: 0.000426 seconds +============================================== +Running query L4 (3 iterations)... +Average time for L4: 0.000925 seconds +============================================== +Running query L5 (3 iterations)... +Average time for L5: 0.002918 seconds +============================================== +Running query S1 (3 iterations)... +Average time for S1: 0.000738 seconds +============================================== +Running query S2 (3 iterations)... +Average time for S2: 0.000557 seconds +============================================== +Running query S3 (3 iterations)... +Average time for S3: 0.000215 seconds +============================================== +Running query S4 (3 iterations)... +Average time for S4: 0.009803 seconds +============================================== +Running query S5 (3 iterations)... +Average time for S5: 0.000188 seconds +============================================== +Running query S6 (3 iterations)... +Average time for S6: 0.000411 seconds +============================================== +Running query S7 (3 iterations)... 
+Average time for S7: 0.000351 seconds + +[DONE] buckets -> ./benchmark_results/20260407_065605/buckets.txt + +============================================== +[RUN] INDEX_TYPE=pso + Output: ./benchmark_results/20260407_065605/pso.txt +============================================== +warning: unused variable: `i` + --> shared\src\index_manager\buckets.rs:219:14 + | +219 | for (i, b) in self.buckets.iter().enumerate() { + | ^ help: if this is intentional, prefix it with an underscore: `_i` + | + = note: `#[warn(unused_variables)]` (part of `#[warn(unused)]`) on by default + +warning: method `insert_optimized` is never used + --> shared\src\index_manager\single_table.rs:141:6 + | +129 | impl SingleTableIndex { + | --------------------- method in this implementation +... +141 | fn insert_optimized(&mut self, triple: &Triple) -> bool { + | ^^^^^^^^^^^^^^^^ + | + = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default + +warning: method `merge_scan` is never used + --> shared\src\index_manager\dynamic_hexastore.rs:409:8 + | +316 | impl IndexPool { + | -------------- method in this implementation +... 
+409 | fn merge_scan(&self, scan_fn: F) -> Option> + | ^^^^^^^^^^ + +warning: `shared` (lib) generated 3 warnings (run `cargo fix --lib -p shared` to apply 1 suggestion) +warning: `shared` (lib) generated 3 warnings (3 duplicates) +warning: unused import: `shared::index_manager::TripleIndex` + --> datalog\src\reasoning_experimental.rs:13:5 + | +13 | use shared::index_manager::TripleIndex; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + + Compiling kolibrie v0.1.1 (C:\Users\mirov\Documents\1.Bestanden\Kolibrie\kolibrie) +warning: `datalog` (lib) generated 1 warning +warning: unused import: `shared::index_manager::TripleIndex` + --> kolibrie\src\storage_manager.rs:16:5 + | +16 | use shared::index_manager::TripleIndex; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + +warning: `kolibrie` (lib) generated 1 warning From 5686a28f079465d1089bfbd323bb93e131eec1a8 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Sat, 11 Apr 2026 11:05:54 +0200 Subject: [PATCH 16/23] Fix buckets --- Cargo.lock | 26 + kolibrie/Cargo.toml | 1 + .../n_triples_data/n_triple_10M.rs | 2 +- .../sparql_syntax/n_triples_data/temp.txt | 451 -- kolibrie/src/sparql_database.rs | 6908 +++++++++-------- shared/src/index_manager/buckets.rs | 804 +- shared/src/query.rs | 10 + 7 files changed, 4188 insertions(+), 4014 deletions(-) delete mode 100644 kolibrie/examples/sparql_syntax/n_triples_data/temp.txt diff --git a/Cargo.lock b/Cargo.lock index e059bec..7d44e47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -803,6 +803,7 @@ dependencies = [ "serde", "serde_json", "shared", + "sysinfo", "url", "winapi", ] @@ -910,6 +911,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1428,6 +1438,7 @@ dependencies = [ "rayon", "serde", "serde_json", + "sysinfo", ] [[package]] @@ -1518,6 +1529,21 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "sysinfo" +version = "0.29.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd727fc423c2060f6c92d9534cef765c65a6ed3f428a03d7def74a8c4348e666" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "winapi", +] + [[package]] name = "target-lexicon" version = "0.12.16" diff --git a/kolibrie/Cargo.toml b/kolibrie/Cargo.toml index 2b147d4..159f917 100644 --- a/kolibrie/Cargo.toml +++ b/kolibrie/Cargo.toml @@ -17,6 +17,7 @@ exclude = ["target/"] build = "build.rs" [dependencies] +sysinfo = "0.29" quick-xml = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index 62baef4..7a15a1c 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -606,7 +606,7 @@ fn parse_large_ntriples_file( } fn run_all_queries(db: &mut SparqlDatabase, workload: &[QuerySpec]) { - const ITERATIONS: usize = 3; + const ITERATIONS: usize = 10; for (name, query) in workload.iter() { println!("=============================================="); diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/temp.txt b/kolibrie/examples/sparql_syntax/n_triples_data/temp.txt deleted file mode 100644 index dbe5c39..0000000 --- a/kolibrie/examples/sparql_syntax/n_triples_data/temp.txt +++ /dev/null @@ -1,451 +0,0 @@ -============================================== - Kolibrie Index Benchmark Runner -============================================== 
-Output directory: ./benchmark_results/20260407_065605 -Index types: buckets pso hexastore dynamic_hexastore ops osp pos sop spo table -============================================== - -[BUILD] Compiling in release mode... -warning: unused variable: `i` - --> shared\src\index_manager\buckets.rs:219:14 - | -219 | for (i, b) in self.buckets.iter().enumerate() { - | ^ help: if this is intentional, prefix it with an underscore: `_i` - | - = note: `#[warn(unused_variables)]` (part of `#[warn(unused)]`) on by default - -warning: method `insert_optimized` is never used - --> shared\src\index_manager\single_table.rs:141:6 - | -129 | impl SingleTableIndex { - | --------------------- method in this implementation -... -141 | fn insert_optimized(&mut self, triple: &Triple) -> bool { - | ^^^^^^^^^^^^^^^^ - | - = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default - -warning: method `merge_scan` is never used - --> shared\src\index_manager\dynamic_hexastore.rs:409:8 - | -316 | impl IndexPool { - | -------------- method in this implementation -... 
-409 | fn merge_scan(&self, scan_fn: F) -> Option> - | ^^^^^^^^^^ - -warning: `shared` (lib) generated 3 warnings (run `cargo fix --lib -p shared` to apply 1 suggestion) -warning: `shared` (lib) generated 3 warnings (3 duplicates) - Compiling kolibrie v0.1.1 (C:\Users\mirov\Documents\1.Bestanden\Kolibrie\kolibrie) -warning: unused import: `shared::index_manager::TripleIndex` - --> datalog\src\reasoning_experimental.rs:13:5 - | -13 | use shared::index_manager::TripleIndex; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - -warning: `datalog` (lib) generated 1 warning -warning: unused import: `shared::index_manager::TripleIndex` - --> kolibrie\src\storage_manager.rs:16:5 - | -16 | use shared::index_manager::TripleIndex; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - -warning: `kolibrie` (lib) generated 1 warning - Finished `release` profile [optimized + debuginfo] target(s) in 24.04s -[BUILD] Done. - -============================================== -[RUN] INDEX_TYPE=buckets - Output: ./benchmark_results/20260407_065605/buckets.txt -============================================== -warning: unused variable: `i` - --> shared\src\index_manager\buckets.rs:219:14 - | -219 | for (i, b) in self.buckets.iter().enumerate() { - | ^ help: if this is intentional, prefix it with an underscore: `_i` - | - = note: `#[warn(unused_variables)]` (part of `#[warn(unused)]`) on by default - -warning: method `insert_optimized` is never used - --> shared\src\index_manager\single_table.rs:141:6 - | -129 | impl SingleTableIndex { - | --------------------- method in this implementation -... 
-141 | fn insert_optimized(&mut self, triple: &Triple) -> bool { - | ^^^^^^^^^^^^^^^^ - | - = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default - -warning: method `merge_scan` is never used - --> shared\src\index_manager\dynamic_hexastore.rs:409:8 - | -316 | impl IndexPool { - | -------------- method in this implementation -... -409 | fn merge_scan(&self, scan_fn: F) -> Option> - | ^^^^^^^^^^ - -warning: `shared` (lib) generated 3 warnings (run `cargo fix --lib -p shared` to apply 1 suggestion) -warning: `shared` (lib) generated 3 warnings (3 duplicates) -warning: unused import: `shared::index_manager::TripleIndex` - --> datalog\src\reasoning_experimental.rs:13:5 - | -13 | use shared::index_manager::TripleIndex; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - - Compiling kolibrie v0.1.1 (C:\Users\mirov\Documents\1.Bestanden\Kolibrie\kolibrie) -warning: `datalog` (lib) generated 1 warning -warning: unused import: `shared::index_manager::TripleIndex` - --> kolibrie\src\storage_manager.rs:16:5 - | -16 | use shared::index_manager::TripleIndex; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - -warning: `kolibrie` (lib) generated 1 warning - Finished `release` profile [optimized + debuginfo] target(s) in 23.97s - Running `C:\Users\mirov\Documents\1.Bestanden\Kolibrie\target\release\examples\n_triple_10M.exe` -INDEX_TYPE = buckets -Starting to parse N-Triples file: ../benchmark_dataset/watdiv.10M.nt -Processed 100000 triples -Processed 200000 triples -Processed 300000 triples -Processed 400000 triples -Processed 500000 triples -Processed 600000 triples -Processed 700000 triples -Processed 800000 triples -Processed 900000 triples -Processed 1000000 triples -Processed 1100000 triples -Processed 1200000 triples -Processed 1300000 triples -Processed 1400000 triples -Processed 1500000 triples -Processed 1600000 
triples -Processed 1700000 triples -Processed 1800000 triples -Processed 1900000 triples -Processed 2000000 triples -Processed 2100000 triples -Processed 2200000 triples -Processed 2300000 triples -Processed 2400000 triples -Processed 2500000 triples -Processed 2600000 triples -Processed 2700000 triples -Processed 2800000 triples -Processed 2900000 triples -Processed 3000000 triples -Processed 3100000 triples -Processed 3200000 triples -Processed 3300000 triples -Processed 3400000 triples -Processed 3500000 triples -Processed 3600000 triples -Processed 3700000 triples -Processed 3800000 triples -Processed 3900000 triples -Processed 4000000 triples -Processed 4100000 triples -Processed 4200000 triples -Processed 4300000 triples -Processed 4400000 triples -Processed 4500000 triples -Processed 4600000 triples -Processed 4700000 triples -Processed 4800000 triples -Processed 4900000 triples -Processed 5000000 triples -Processed 5100000 triples -Processed 5200000 triples -Processed 5300000 triples -Processed 5400000 triples -Processed 5500000 triples -Processed 5600000 triples -Processed 5700000 triples -Processed 5800000 triples -Processed 5900000 triples -Processed 6000000 triples -Processed 6100000 triples -Processed 6200000 triples -Processed 6300000 triples -Processed 6400000 triples -Processed 6500000 triples -Processed 6600000 triples -Processed 6700000 triples -Processed 6800000 triples -Processed 6900000 triples -Processed 7000000 triples -Processed 7100000 triples -Processed 7200000 triples -Processed 7300000 triples -Processed 7400000 triples -Processed 7500000 triples -Processed 7600000 triples -Processed 7700000 triples -Processed 7800000 triples -Processed 7900000 triples -Processed 8000000 triples -Processed 8100000 triples -Processed 8200000 triples -Processed 8300000 triples -Processed 8400000 triples -Processed 8500000 triples -Processed 8600000 triples -Processed 8700000 triples -Processed 8800000 triples -Processed 8900000 triples -Processed 9000000 
triples -Processed 9100000 triples -Processed 9200000 triples -Processed 9300000 triples -Processed 9400000 triples -Processed 9500000 triples -Processed 9600000 triples -Processed 9700000 triples -Processed 9800000 triples -Processed 9900000 triples -Processed 10000000 triples -Processed 10100000 triples -Processed 10200000 triples -Processed 10300000 triples -Processed 10400000 triples -Processed 10500000 triples -Processed 10600000 triples -Processed 10700000 triples -Processed 10800000 triples -Processed 10900000 triples -Finished parsing 10916457 triples in 14.98 seconds -Building indexes... -[Bucket Debug] --- BucketIndex Initialization --- -[Bucket Debug] Requested planned patterns: 103 -[Bucket Debug] Bucket [0]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v1")), Bound: s=false, p=true, o=false -[Bucket Debug] Bucket [1]: Pattern: (Variable("?v0"), Constant(205904), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [2]: Pattern: (Variable("?v0"), Constant(205881), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [3]: Pattern: (Variable("?v0"), Constant(205988), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [4]: Pattern: (Variable("?v4"), Constant(601766), Variable("?v5")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [5]: Pattern: (Variable("?v4"), Constant(601764), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [6]: Pattern: (Variable("?v7"), Constant(206401), Variable("?v6")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [7]: Pattern: (Variable("?v7"), Constant(206079), Variable("?v8")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [8]: Pattern: (Variable("?v2"), Constant(287), Constant(22)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [9]: Pattern: (Variable("?v0"), Constant(593640), Variable("?v2")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [10]: Pattern: (Variable("?v2"), Constant(266), 
Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [11]: Pattern: (Variable("?v4"), Constant(779978), Variable("?v5")), Bound: s=false, p=true, o=false -[Bucket Debug] Bucket [12]: Pattern: (Variable("?v4"), Constant(205883), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [13]: Pattern: (Variable("?v4"), Constant(780739), Variable("?v7")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [14]: Pattern: (Variable("?v7"), Constant(443313), Variable("?v3")), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [15]: Pattern: (Variable("?v3"), Constant(205988), Variable("?v8")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [16]: Pattern: (Variable("?v0"), Constant(593711), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [17]: Pattern: (Variable("?v8"), Constant(601879), Variable("?v9")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [18]: Pattern: (Variable("?v0"), Constant(779975), Variable("?v1")), Bound: s=false, p=true, o=false -[Bucket Debug] Bucket [19]: Pattern: (Variable("?v0"), Constant(779930), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [20]: Pattern: (Variable("?v0"), Constant(779874), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [21]: Pattern: (Variable("?v0"), Constant(779919), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [22]: Pattern: (Variable("?v0"), Constant(779869), Variable("?v5")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [23]: Pattern: (Variable("?v0"), Constant(779922), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [24]: Pattern: (Variable("?v3"), Constant(205876), Constant(205978)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [25]: Pattern: (Variable("?v3"), Constant(208830), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [26]: Pattern: (Variable("?v3"), Constant(205954), Variable("?v5")), Bound: 
s=true, p=true, o=false -[Bucket Debug] Bucket [27]: Pattern: (Variable("?v3"), Constant(205867), Variable("?v0")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [28]: Pattern: (Variable("?v0"), Constant(205887), Constant(205986)), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [29]: Pattern: (Variable("?v0"), Constant(205876), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [30]: Pattern: (Variable("?v0"), Constant(205867), Constant(209611)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [31]: Pattern: (Variable("?v0"), Constant(205883), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [32]: Pattern: (Variable("?v0"), Constant(205872), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [33]: Pattern: (Variable("?v0"), Constant(1052657), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [34]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [35]: Pattern: (Variable("?v0"), Constant(205874), Variable("?v5")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [36]: Pattern: (Variable("?v1"), Constant(1045092), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [37]: Pattern: (Variable("?v1"), Constant(1045090), Variable("?v7")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [38]: Pattern: (Variable("?v0"), Constant(205867), Constant(254774)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [39]: Pattern: (Variable("?v0"), Constant(205881), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [40]: Pattern: (Variable("?v0"), Constant(205901), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [41]: Pattern: (Variable("?v5"), Constant(443313), Variable("?v0")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [42]: Pattern: (Variable("?v4"), Constant(780739), Variable("?v5")), Bound: s=false, p=true, o=true 
-[Bucket Debug] Bucket [43]: Pattern: (Variable("?v5"), Constant(443311), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [44]: Pattern: (Variable("?v0"), Constant(205887), Constant(205942)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [45]: Pattern: (Variable("?v0"), Constant(205883), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [46]: Pattern: (Variable("?v2"), Constant(266), Variable("?v0")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [47]: Pattern: (Variable("?v0"), Constant(205874), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [48]: Pattern: (Variable("?v0"), Constant(205901), Variable("?v8")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [49]: Pattern: (Variable("?v7"), Constant(779975), Variable("?v0")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [50]: Pattern: (Variable("?v1"), Constant(1045092), Variable("?v5")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [51]: Pattern: (Variable("?v1"), Constant(1045090), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [52]: Pattern: (Variable("?v1"), Constant(206079), Constant(206080)), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [53]: Pattern: (Constant(595878), Constant(593640), Variable("?v0")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [54]: Pattern: (Variable("?v0"), Constant(266), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [55]: Pattern: (Variable("?v0"), Constant(268), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [56]: Pattern: (Variable("?v0"), Constant(278), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [57]: Pattern: (Variable("?v1"), Constant(205872), Variable("?v5")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [58]: Pattern: (Variable("?v1"), Constant(1052657), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [59]: Pattern: 
(Variable("?v0"), Constant(779817), Constant(205907)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [60]: Pattern: (Variable("?v0"), Constant(779975), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [61]: Pattern: (Variable("?v2"), Constant(206061), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [62]: Pattern: (Constant(0), Constant(1), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [63]: Pattern: (Variable("?v2"), Constant(779875), Variable("?v1")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [64]: Pattern: (Variable("?v2"), Constant(779975), Constant(63618)), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [65]: Pattern: (Variable("?v0"), Constant(779817), Constant(205907)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [66]: Pattern: (Variable("?v0"), Constant(779975), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [67]: Pattern: (Variable("?v0"), Constant(205887), Constant(205888)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [68]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [69]: Pattern: (Variable("?v0"), Constant(779978), Variable("?v1")), Bound: s=false, p=true, o=false -[Bucket Debug] Bucket [70]: Pattern: (Variable("?v0"), Constant(779875), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [71]: Pattern: (Constant(0), Constant(1), Variable("?v3")), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [72]: Pattern: (Constant(593635), Constant(593640), Variable("?v0")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [73]: Pattern: (Variable("?v0"), Constant(266), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [74]: Pattern: (Variable("?v0"), Constant(268), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [75]: Pattern: (Variable("?v0"), Constant(270), Variable("?v4")), 
Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [76]: Pattern: (Variable("?v0"), Constant(292), Variable("?v5")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [77]: Pattern: (Variable("?v0"), Constant(278), Variable("?v6")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [78]: Pattern: (Variable("?v0"), Constant(272), Variable("?v7")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [79]: Pattern: (Variable("?v0"), Constant(287), Variable("?v8")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [80]: Pattern: (Variable("?v0"), Constant(296), Variable("?v9")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [81]: Pattern: (Variable("?v0"), Constant(779875), Constant(2)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [82]: Pattern: (Variable("?v0"), Constant(1052658), Constant(779980)), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [83]: Pattern: (Variable("?v0"), Constant(779874), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [84]: Pattern: (Variable("?v0"), Constant(779869), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [85]: Pattern: (Variable("?v0"), Constant(1052658), Constant(206168)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [86]: Pattern: (Variable("?v0"), Constant(206061), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [87]: Pattern: (Variable("?v0"), Constant(205867), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [88]: Pattern: (Variable("?v0"), Constant(206163), Variable("?v4")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [89]: Pattern: (Variable("?v0"), Constant(779919), Constant(779920)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [90]: Pattern: (Variable("?v0"), Constant(779875), Constant(11)), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [91]: Pattern: (Variable("?v0"), Constant(779921), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] 
Bucket [92]: Pattern: (Variable("?v3"), Constant(205965), Variable("?v0")), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [93]: Pattern: (Variable("?v0"), Constant(1052658), Constant(205978)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [94]: Pattern: (Variable("?v0"), Constant(206079), Constant(206080)), Bound: s=true, p=true, o=true -[Bucket Debug] Bucket [95]: Pattern: (Variable("?v0"), Constant(205874), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [96]: Pattern: (Variable("?v0"), Constant(205954), Variable("?v3")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [97]: Pattern: (Variable("?v0"), Constant(205867), Constant(205868)), Bound: s=false, p=true, o=true -[Bucket Debug] Bucket [98]: Pattern: (Variable("?v0"), Constant(206582), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [99]: Pattern: (Variable("?v0"), Constant(1052658), Variable("?v2")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [100]: Pattern: (Constant(608516), Constant(779975), Variable("?v0")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [101]: Pattern: (Variable("?v0"), Constant(1052658), Variable("?v1")), Bound: s=true, p=true, o=false -[Bucket Debug] Bucket [102]: Pattern: (Variable("?v0"), Constant(205904), Variable("?v2")), Bound: s=true, p=true, o=false -Indexes built in 4.25 seconds -Successfully processed N-Triples file -============================================== -Running query C1 (3 iterations)... -Average time for C1: 0.004475 seconds -============================================== -Running query C2 (3 iterations)... -Average time for C2: 0.552848 seconds -============================================== -Running query C3 (3 iterations)... -Average time for C3: 0.847724 seconds -============================================== -Running query F1 (3 iterations)... -Average time for F1: 0.001397 seconds -============================================== -Running query F2 (3 iterations)... 
-Average time for F2: 0.000396 seconds -============================================== -Running query F3 (3 iterations)... -Average time for F3: 0.000446 seconds -============================================== -Running query F4 (3 iterations)... -Average time for F4: 0.004124 seconds -============================================== -Running query F5 (3 iterations)... -Average time for F5: 0.000712 seconds -============================================== -Running query L1 (3 iterations)... -Average time for L1: 0.000435 seconds -============================================== -Running query L2 (3 iterations)... -Average time for L2: 0.001124 seconds -============================================== -Running query L3 (3 iterations)... -Average time for L3: 0.000426 seconds -============================================== -Running query L4 (3 iterations)... -Average time for L4: 0.000925 seconds -============================================== -Running query L5 (3 iterations)... -Average time for L5: 0.002918 seconds -============================================== -Running query S1 (3 iterations)... -Average time for S1: 0.000738 seconds -============================================== -Running query S2 (3 iterations)... -Average time for S2: 0.000557 seconds -============================================== -Running query S3 (3 iterations)... -Average time for S3: 0.000215 seconds -============================================== -Running query S4 (3 iterations)... -Average time for S4: 0.009803 seconds -============================================== -Running query S5 (3 iterations)... -Average time for S5: 0.000188 seconds -============================================== -Running query S6 (3 iterations)... -Average time for S6: 0.000411 seconds -============================================== -Running query S7 (3 iterations)... 
-Average time for S7: 0.000351 seconds - -[DONE] buckets -> ./benchmark_results/20260407_065605/buckets.txt - -============================================== -[RUN] INDEX_TYPE=pso - Output: ./benchmark_results/20260407_065605/pso.txt -============================================== -warning: unused variable: `i` - --> shared\src\index_manager\buckets.rs:219:14 - | -219 | for (i, b) in self.buckets.iter().enumerate() { - | ^ help: if this is intentional, prefix it with an underscore: `_i` - | - = note: `#[warn(unused_variables)]` (part of `#[warn(unused)]`) on by default - -warning: method `insert_optimized` is never used - --> shared\src\index_manager\single_table.rs:141:6 - | -129 | impl SingleTableIndex { - | --------------------- method in this implementation -... -141 | fn insert_optimized(&mut self, triple: &Triple) -> bool { - | ^^^^^^^^^^^^^^^^ - | - = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default - -warning: method `merge_scan` is never used - --> shared\src\index_manager\dynamic_hexastore.rs:409:8 - | -316 | impl IndexPool { - | -------------- method in this implementation -... 
-409 | fn merge_scan(&self, scan_fn: F) -> Option> - | ^^^^^^^^^^ - -warning: `shared` (lib) generated 3 warnings (run `cargo fix --lib -p shared` to apply 1 suggestion) -warning: `shared` (lib) generated 3 warnings (3 duplicates) -warning: unused import: `shared::index_manager::TripleIndex` - --> datalog\src\reasoning_experimental.rs:13:5 - | -13 | use shared::index_manager::TripleIndex; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - - Compiling kolibrie v0.1.1 (C:\Users\mirov\Documents\1.Bestanden\Kolibrie\kolibrie) -warning: `datalog` (lib) generated 1 warning -warning: unused import: `shared::index_manager::TripleIndex` - --> kolibrie\src\storage_manager.rs:16:5 - | -16 | use shared::index_manager::TripleIndex; - | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - -warning: `kolibrie` (lib) generated 1 warning diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index 3316e1f..8f26c74 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -8,19 +8,16 @@ * you can obtain one at https://mozilla.org/MPL/2.0/. 
*/ -use shared::dictionary::Dictionary; -use crate::sliding_window::SlidingWindow; -use shared::triple::TimestampedTriple; -use shared::triple::Triple; -use shared::query::FilterExpression; +#[cfg(feature = "cuda")] +use crate::cuda::cuda_join::*; use crate::parser; +use crate::parser::convert_triple_pattern; +use crate::query_builder::QueryBuilder; +use crate::sliding_window::SlidingWindow; +use crate::streamertail_optimizer::DatabaseStats; use crate::utils; use crate::utils::current_timestamp; use crate::utils::ClonableFn; -#[cfg(feature = "cuda")] -use crate::cuda::cuda_join::*; -use shared::index_manager::TripleIndex; -use crate::query_builder::QueryBuilder; use crossbeam::channel::unbounded; use crossbeam::scope; use percent_encoding::percent_decode; @@ -28,21 +25,26 @@ use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::Reader; use rayon::prelude::*; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use std::arch::x86_64::*; +use shared::dictionary::Dictionary; +use shared::index_manager::TripleIndex; +use shared::index_manager::{ + BucketIndex, DynamicHexastoreIndex, HexastoreIndex, IndexConfig, OPSSingleIndex, + OSPSingleIndex, POSSingleIndex, PSOSingleIndex, SOPSingleIndex, SPOSingleIndex, + SingleTableIndex, +}; +use shared::query::FilterExpression; +use shared::terms::TriplePattern; +use shared::triple::TimestampedTriple; +use shared::triple::Triple; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use std::arch::x86_64::*; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::sync::{Mutex, RwLock}; +use sysinfo::{ProcessExt, System, SystemExt}; use url::Url; -use crate::streamertail_optimizer::DatabaseStats; -use shared::index_manager::{ - IndexConfig, HexastoreIndex, SPOSingleIndex, POSSingleIndex, OSPSingleIndex, - PSOSingleIndex, OPSSingleIndex, SOPSingleIndex, SingleTableIndex, DynamicHexastoreIndex, BucketIndex -}; -use 
shared::terms::TriplePattern; -use crate::parser::convert_triple_pattern; const MIN_CHUNK_SIZE: usize = 1024; const HASHMAP_INITIAL_CAPACITY: usize = 4096; @@ -52,3409 +54,3753 @@ const HASHMAP_INITIAL_CAPACITY1: usize = 1024; #[derive(Debug, Clone)] pub struct SparqlDatabase { - pub triples: BTreeSet, - pub streams: Vec, - pub sliding_window: Option, - pub dictionary: Arc>, - pub prefixes: HashMap, - pub udfs: HashMap, - pub index_manager: Option>, - pub rule_map: HashMap, - pub cached_stats: Option>, - index_config: IndexConfig, + pub triples: BTreeSet, + pub streams: Vec, + pub sliding_window: Option, + pub dictionary: Arc>, + pub prefixes: HashMap, + pub udfs: HashMap, + pub index_manager: Option>, + pub rule_map: HashMap, + pub cached_stats: Option>, + index_config: IndexConfig, } #[allow(dead_code)] impl SparqlDatabase { - pub fn new() -> Self { - Self::with_config(IndexConfig::Hexastore) - } - - /// Creates a new database with a user-chosen indexing strategy. - pub fn with_config(config: IndexConfig) -> Self { - Self { - triples: BTreeSet::new(), - streams: Vec::new(), - sliding_window: None, - dictionary: Arc::new(RwLock::new(Dictionary::new())), - prefixes: HashMap::new(), - udfs: HashMap::new(), - index_manager: None, - rule_map: HashMap::new(), - cached_stats: None, - index_config: config, - } - } - pub fn set_prefixes(&mut self, prefixes: HashMap){ - self.prefixes=prefixes; - } - - fn make_initial_index(config: &IndexConfig) -> Box { - match config { - IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), - IndexConfig::SPO => Box::new(SPOSingleIndex::new()), - IndexConfig::POS => Box::new(POSSingleIndex::new()), - IndexConfig::OSP => Box::new(OSPSingleIndex::new()), - IndexConfig::PSO => Box::new(PSOSingleIndex::new()), - IndexConfig::OPS => Box::new(OPSSingleIndex::new()), - IndexConfig::SOP => Box::new(SOPSingleIndex::new()), - IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), - // Pattern-dependent indexes start as hexastore; - // 
`build_all_indexes` will swap them out. - IndexConfig::DynamicHexastore { .. } => Box::new(HexastoreIndex::new()), - IndexConfig::Buckets { .. } => Box::new(HexastoreIndex::new()), - } - } - - fn resolve_query_patterns(&self, raw_queries: &[String]) -> Vec { - let mut patterns = Vec::new(); - - for query_str in raw_queries { - // parse_sparql_query returns a big tuple; field index 2 - // is the Vec of raw (&str, &str, &str) triple patterns, - // field index 5 is the HashMap of prefixes. - if let Ok((_rest, parsed)) = crate::parser::parse_sparql_query(query_str) { - let raw_patterns = parsed.2; // Vec<(&str, &str, &str)> - let query_prefixes = parsed.5; // HashMap - - // Merge query prefixes with database prefixes - let mut all_prefixes = self.prefixes.clone(); - for (k, v) in query_prefixes { - all_prefixes.insert(k, v); - } + pub fn new() -> Self { + Self::with_config(IndexConfig::Hexastore) + } + + /// Creates a new database with a user-chosen indexing strategy. + pub fn with_config(config: IndexConfig) -> Self { + Self { + triples: BTreeSet::new(), + streams: Vec::new(), + sliding_window: None, + dictionary: Arc::new(RwLock::new(Dictionary::new())), + prefixes: HashMap::new(), + udfs: HashMap::new(), + index_manager: None, + rule_map: HashMap::new(), + cached_stats: None, + index_config: config, + } + } + pub fn set_prefixes(&mut self, prefixes: HashMap) { + self.prefixes = prefixes; + } + + fn make_initial_index(config: &IndexConfig) -> Box { + match config { + IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), + IndexConfig::SPO => Box::new(SPOSingleIndex::new()), + IndexConfig::POS => Box::new(POSSingleIndex::new()), + IndexConfig::OSP => Box::new(OSPSingleIndex::new()), + IndexConfig::PSO => Box::new(PSOSingleIndex::new()), + IndexConfig::OPS => Box::new(OPSSingleIndex::new()), + IndexConfig::SOP => Box::new(SOPSingleIndex::new()), + IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), + // Pattern-dependent indexes start as hexastore; + // 
`build_all_indexes` will swap them out. + IndexConfig::DynamicHexastore { .. } => Box::new(HexastoreIndex::new()), + IndexConfig::Buckets { .. } => Box::new(HexastoreIndex::new()), + } + } + + fn resolve_planned_access_patterns( + &mut self, + raw_queries: &[String], + ) -> Vec { + use crate::streamertail_optimizer::operators::PhysicalOperator; + use crate::streamertail_optimizer::utils::build_logical_plan; + use crate::streamertail_optimizer::Streamertail; + use shared::query::PlannedAccessPattern; + use shared::terms::Term; + use std::collections::HashSet; + + let mut planned_patterns = Vec::new(); + + for query_str in raw_queries { + if let Ok(( + _, + ( + _insert_clause, + variables, + patterns, + filters, + _group_vars, + parsed_prefixes, + values_clause, + binds, + _subqueries, + _limit, + _, + _order_conditions, + ), + )) = crate::parser::parse_sparql_query(query_str) + { + let mut prefixes = self.prefixes.clone(); + for (k, v) in parsed_prefixes { + prefixes.insert(k, v); + } - let mut dict = self.dictionary.write().unwrap(); - for triple in raw_patterns { - patterns.push(convert_triple_pattern(triple, &mut dict, &all_prefixes)); - } - } - } - - patterns - } - - pub fn build_all_indexes(&mut self) { - let triples: Vec = self.triples.iter().cloned().collect(); - - let mut index: Box = match &self.index_config { - IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), - IndexConfig::SPO => Box::new(SPOSingleIndex::new()), - IndexConfig::POS => Box::new(POSSingleIndex::new()), - IndexConfig::OSP => Box::new(OSPSingleIndex::new()), - IndexConfig::PSO => Box::new(PSOSingleIndex::new()), - IndexConfig::OPS => Box::new(OPSSingleIndex::new()), - IndexConfig::SOP => Box::new(SOPSingleIndex::new()), - IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), - - IndexConfig::DynamicHexastore { eval_interval, queries } => { - let patterns = self.resolve_query_patterns(queries); - let eval = *eval_interval as usize; - 
Box::new(DynamicHexastoreIndex::new(patterns, eval)) - } - - IndexConfig::Buckets { queries } => { - let patterns = self.resolve_query_patterns(queries); - Box::new(BucketIndex::new(patterns)) - } - - // Future index types go here: - // IndexConfig::YourNewIndex { some_param, queries } => { - // let patterns = self.resolve_query_patterns(queries); - // Box::new(YourNewIndex::new(patterns, *some_param)) - // } - }; - - index.build_from_triples(&triples); - index.optimize(); - self.index_manager = Some(index); - } - - /// Get a reference to the index. - /// Panics if `build_all_indexes()` hasn't been called yet. - pub fn index(&self) -> &dyn TripleIndex { - self.index_manager - .as_deref() - .expect("index not built — call build_all_indexes() first") - } - - /// Get a mutable reference to the index. - /// Panics if `build_all_indexes()` hasn't been called yet. - pub fn index_mut(&mut self) -> &mut dyn TripleIndex { - self.index_manager - .as_deref_mut() - .expect("index not built — call build_all_indexes() first") - } - - pub fn get_or_build_stats(&mut self) -> Arc { - if let Some(stats) = &self.cached_stats { - return stats.clone(); // ← Clone the Arc (cheap), not the DatabaseStats - } - - let stats = Arc::new(DatabaseStats::gather_stats_fast(self)); - self.cached_stats = Some(stats.clone()); - stats - } - - pub fn invalidate_stats_cache(&mut self) { - self.cached_stats = None; - } - - pub fn query(&self) -> QueryBuilder<'_> { - QueryBuilder::new(self) - } - - pub fn add_triple(&mut self, triple: Triple) { - self.triples.insert(triple.clone()); - if let Some(ref mut idx) = self.index_manager { - idx.insert(&triple); - } - } - - pub fn delete_triple(&mut self, triple: &Triple) -> bool { - let removed = self.triples.remove(triple); - if removed { - if let Some(ref mut idx) = self.index_manager { - idx.delete(triple); - } - } - removed - } - - /// Helper function that accepts parts of a triple, constructs a Triple, and adds it - pub fn add_triple_parts(&mut self, 
subject: &str, predicate: &str, object: &str) { - let mut dict = self.dictionary.write().unwrap(); - let subject_id = dict.encode(subject); - let predicate_id = dict.encode(predicate); - let object_id = dict.encode(object); - drop(dict); - - let triple = Triple { - subject: subject_id, - predicate: predicate_id, - object: object_id, - }; - self.add_triple(triple); - } - - /// Helper function that accepts parts of a triple, constructs a Triple, and deletes it - pub fn delete_triple_parts(&mut self, subject: &str, predicate: &str, object: &str) -> bool { - let mut dict = self.dictionary.write().unwrap(); - let subject_id = dict.encode(subject); - let predicate_id = dict.encode(predicate); - let object_id = dict.encode(object); - drop(dict); - - let triple = Triple { - subject: subject_id, - predicate: predicate_id, - object: object_id, - }; - self.delete_triple(&triple) - } - - pub fn generate_rdf_xml(&mut self) -> String { - let mut xml = String::new(); - xml.push_str("\n"); - xml.push_str("\n"); - - // Group triples by subject - let dict = self.dictionary.read().unwrap(); - let mut subjects: BTreeMap> = BTreeMap::new(); - for triple in &self.triples { - let subject = dict.decode(triple.subject); - let predicate = dict.decode(triple.predicate); - let object = dict.decode(triple.object); - subjects.entry(subject.unwrap().to_string()).or_default().push((predicate.unwrap().to_string(), object.unwrap().to_string())); - } - drop(dict); - - // For each subject, create an element. 
- for (subject, po_pairs) in subjects { - xml.push_str(&format!(" \n", subject)); - for (predicate, object) in po_pairs { - xml.push_str(&format!(" <{}>{}\n", predicate, object, predicate)); - } - xml.push_str(" \n"); - } - - xml.push_str("\n"); - xml - } - - pub fn parse_rdf(&mut self, rdf_xml: &str) { - let mut reader = Reader::from_str(rdf_xml); - - let mut current_subject = Vec::with_capacity(128); - let mut current_predicate = Vec::with_capacity(128); - - let (sender, receiver) = unbounded::>(); - let dictionary = Arc::clone(&self.dictionary); - let triples_set = Arc::new(Mutex::new(Vec::new())); - let num_threads = utils::get_num_cpus(); - - // Crossbeam scope to manage threads - scope(|s| { - // Spawn worker threads - for _ in 0..num_threads { - let receiver = receiver.clone(); - let triples_set = Arc::clone(&triples_set); - s.spawn(move |_| { - while let Ok(chunk) = receiver.recv() { - if chunk.is_empty() { - // Termination signal - break; - } - - // Process chunk using Rayon - let local_triples: BTreeSet = - chunk.into_par_iter().map(|triple| triple).collect(); - - // Insert into shared triples set - let mut triples = triples_set.lock().unwrap(); - triples.push(local_triples); - } - }); - } - - // Parsing and sending chunks - let mut triples = Vec::with_capacity(8192); - loop { - match reader.read_event() { - Ok(Event::Start(ref e)) => match e.name() { - QName(b"rdf:RDF") => { - for attr in e.attributes().filter_map(Result::ok) { - let key = attr.key; - let value = attr.value; - if key.as_ref().starts_with(b"xmlns:") { - let prefix = std::str::from_utf8(&key.as_ref()[6..]) - .unwrap_or("") - .to_string(); - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert(prefix, uri); - } else if key.as_ref() == b"xmlns" { - // Default namespace - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert("".to_string(), uri); - } - } - } - QName(b"rdf:Description") => { - for attr in 
e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:about") { - current_subject.truncate(0); - current_subject.extend_from_slice(&attr.value); - } - } - } - QName(b"rdfs:Class") | QName(b"rdf:type") => { - current_predicate.truncate(0); - current_predicate.extend_from_slice(b"rdf:type"); - } - QName(b"rdfs:subClassOf") => { - current_predicate.truncate(0); - current_predicate.extend_from_slice(b"rdfs:subClassOf"); - } - QName(b"rdfs:label") => { - current_predicate.truncate(0); - current_predicate.extend_from_slice(b"rdfs:label"); - } - name => { - let name_str = - std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); - let resolved_predicate = self.resolve_term(&name_str); - current_predicate = resolved_predicate.clone().into_bytes(); - } - }, - Ok(Event::Empty(ref e)) => { - if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { - let resolved_predicate = self.resolve_term(predicate); - let mut object = Vec::with_capacity(128); - for attr in e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:resource") { - object.extend_from_slice(&attr.value); - } - } - if !object.is_empty() { - if let (Ok(subject_str), Ok(object_str)) = ( - std::str::from_utf8(¤t_subject), - std::str::from_utf8(&object), + let logical_plan = build_logical_plan( + variables.iter().map(|(t, v, _)| (*t, *v)).collect(), + patterns, + filters.clone(), + &prefixes, + self, + &binds, + values_clause.as_ref(), + ); + + //println!("\n[Plan Debug] === UNOPTIMIZED LOGICAL PLAN ==="); + //println!("{:#?}", logical_plan); + + // Fetch database stats & use Streamertail optimizer to find the exact physical execution plan + let stats = self.get_or_build_stats(); + let mut optimizer = Streamertail::with_cached_stats(stats.clone()); + let optimized_plan = optimizer.find_best_plan(&logical_plan); + + //println!("\n[Plan Debug] === OPTIMIZED PHYSICAL PLAN ==="); + //println!("{:#?}\n", optimized_plan); + + let mut bound_vars = HashSet::new(); + + // Values_clause 
bindings are available from the very beginning + if let Some(vc) = values_clause { + for var in &vc.variables { + let mut v = var.to_string(); + if !v.starts_with('?') { + v = format!("?{}", v); + } + bound_vars.insert(v); + } + } + + // Helper recursive function to walk the PHYSICAL plan execution tree + fn traverse_physical( + op: &PhysicalOperator, + bound_vars: &mut HashSet, + out: &mut Vec, ) { - // Lock the dictionary for encoding - let mut dict = dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(object_str), - }; - drop(dict); // Release the lock - triples.push(triple); - } - } - } - } - Ok(Event::Text(e)) => { - // Use Reader's decode method and trim whitespace - if let Ok(object_str) = reader.decoder().decode(e.as_ref()) { - let trimmed_object = object_str.trim(); - // Skip empty or whitespace-only text - if !trimmed_object.is_empty() { - if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { - if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { - let resolved_predicate = self.resolve_term(predicate_str); - // Lock the dictionary for encoding - let mut dict = dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(trimmed_object), - }; - drop(dict); // Release the lock - triples.push(triple); - } - } - } - } - } - Ok(Event::End(ref e)) => { - if e.name() == QName(b"rdf:Description") { - current_subject.truncate(0); - current_predicate.truncate(0); - } - } - Ok(Event::Eof) => break, - Err(e) => { - eprintln!("Error reading XML: {:?}", e); - break; - } - _ => {} - } - - if triples.len() >= 8192 { - sender.send(triples).unwrap(); - triples = Vec::with_capacity(8192); - } - } - - if !triples.is_empty() { - sender.send(triples).unwrap(); - } - - // Send termination signals - for _ in 0..num_threads { - sender.send(Vec::new()).unwrap(); - 
} - }) - .unwrap(); - - // Merge all BTreeSets into the main triples set - let triples_sets = Arc::try_unwrap(triples_set).unwrap().into_inner().unwrap(); - for local_triples in triples_sets { - self.triples.extend(local_triples); - } - } - - pub fn parse_rdf_from_file(&mut self, filename: &str) { - let file = std::fs::File::open(filename).expect("Cannot open file"); - let reader = std::io::BufReader::new(file); - let mut xml_reader = Reader::from_reader(reader); - - let mut current_subject = Vec::with_capacity(128); - let mut current_predicate = Vec::with_capacity(128); - - // First, read prefixes before spawning worker threads - let mut buf = Vec::new(); - loop { - match xml_reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => { - if e.name() == QName(b"rdf:RDF") { - // Read prefixes - for attr in e.attributes().filter_map(Result::ok) { - let key = attr.key; - let value = attr.value; - if key.as_ref().starts_with(b"xmlns:") { - let prefix = std::str::from_utf8(&key.as_ref()[6..]) - .unwrap_or("") - .to_string(); - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert(prefix, uri); - } else if key.as_ref() == b"xmlns" { - // Default namespace - let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); - self.prefixes.insert("".to_string(), uri); - } - } - break; // We have read the prefixes, proceed to the rest - } - } - Ok(Event::Eof) => { - eprintln!("Reached EOF before reading prefixes."); - break; - } - Err(e) => { - eprintln!("Error reading XML: {:?}", e); - break; - } - _ => {} - } - buf.clear(); - } - - // Continue reading and parsing the rest of the file - let mut triples = Vec::with_capacity(8192); - loop { - match xml_reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => match e.name() { - QName(b"rdf:Description") => { - for attr in e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:about") { - current_subject.clear(); - current_subject.extend_from_slice(&attr.value); - } - 
} - } - QName(b"rdfs:Class") | QName(b"rdf:type") => { - current_predicate.clear(); - current_predicate.extend_from_slice(b"rdf:type"); - } - QName(b"rdfs:subClassOf") => { - current_predicate.clear(); - current_predicate.extend_from_slice(b"rdfs:subClassOf"); - } - QName(b"rdfs:label") => { - current_predicate.clear(); - current_predicate.extend_from_slice(b"rdfs:label"); - } - name => { - let name_str = std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); - let resolved_predicate = self.resolve_term(&name_str); - current_predicate = resolved_predicate.clone().into_bytes(); - } - }, - Ok(Event::Empty(ref e)) => { - if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { - let resolved_predicate = self.resolve_term(predicate); - let mut object = Vec::with_capacity(128); - for attr in e.attributes().filter_map(Result::ok) { - if attr.key == QName(b"rdf:resource") { - object.extend_from_slice(&attr.value); - } - } - if !object.is_empty() { - if let (Ok(subject_str), Ok(object_str)) = ( - std::str::from_utf8(¤t_subject), - std::str::from_utf8(&object), - ) { + match op { + PhysicalOperator::TableScan { pattern } + | PhysicalOperator::IndexScan { pattern } => { + let (s, p, o) = pattern; + + let bound_subject = match s { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + let bound_predicate = match p { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + let bound_object = match o { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + + out.push(PlannedAccessPattern { + pattern: pattern.clone(), + bound_subject, + bound_predicate, + bound_object, + }); + + // Variables from this scan are now bound for downstream pipeline operations + if let Term::Variable(v) = s { + bound_vars.insert(v.clone()); + } + if let Term::Variable(v) = p { + bound_vars.insert(v.clone()); + } + if let Term::Variable(v) = o { + bound_vars.insert(v.clone()); + } + } + 
PhysicalOperator::StarJoin { + join_var: _, + patterns, + } => { + let mut sorted_patterns = patterns.clone(); + + // Crucial Fix: StarJoin engine executes patterns by selectivity (most constants first) + // We must sort them exactly how the engine executes them to track variables accurately. + sorted_patterns.sort_by_key(|p| { + let mut constants = 0; + if matches!(p.0, Term::Constant(_)) { + constants += 1; + } + if matches!(p.1, Term::Constant(_)) { + constants += 1; + } + if matches!(p.2, Term::Constant(_)) { + constants += 1; + } + std::cmp::Reverse(constants) + }); + + for pattern in &sorted_patterns { + let (s, p, o) = pattern; + let bound_subject = match s { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + let bound_predicate = match p { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + let bound_object = match o { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + + out.push(PlannedAccessPattern { + pattern: pattern.clone(), + bound_subject, + bound_predicate, + bound_object, + }); + + // Post-scan, its variables are bound for the subsequent internal StarJoin patterns + if let Term::Variable(v) = s { + bound_vars.insert(v.clone()); + } + if let Term::Variable(v) = p { + bound_vars.insert(v.clone()); + } + if let Term::Variable(v) = o { + bound_vars.insert(v.clone()); + } + } + } + PhysicalOperator::NestedLoopJoin { left, right } + | PhysicalOperator::ParallelJoin { left, right } => { + // Pipeline joins: Left executes first, bindings flow completely into right side + traverse_physical(left, bound_vars, out); + traverse_physical(right, bound_vars, out); + } + PhysicalOperator::HashJoin { left, right } + | PhysicalOperator::OptimizedHashJoin { left, right } => { + // Hash joins: Both sides evaluate independently using ONLY the pre-join bounds + let mut left_vars = bound_vars.clone(); + let mut right_vars = bound_vars.clone(); + traverse_physical(left, &mut 
left_vars, out); + traverse_physical(right, &mut right_vars, out); + + // After execution, the result contains variables from both sides + bound_vars.extend(left_vars); + bound_vars.extend(right_vars); + } + PhysicalOperator::Filter { input, .. } + | PhysicalOperator::Projection { input, .. } => { + traverse_physical(input, bound_vars, out); + } + PhysicalOperator::Subquery { inner, .. } => { + traverse_physical(inner, bound_vars, out); + } + PhysicalOperator::Bind { + input, + output_variable, + .. + } + | PhysicalOperator::MLPredict { + input, + output_variable, + .. + } => { + traverse_physical(input, bound_vars, out); + bound_vars.insert(output_variable.clone()); + } + PhysicalOperator::Values { variables, .. } => { + for var in variables { + let mut v = var.clone(); + if !v.starts_with('?') { + v = format!("?{}", v); + } + bound_vars.insert(v); + } + } + PhysicalOperator::InMemoryBuffer { .. } => {} + } + } + + traverse_physical(&optimized_plan, &mut bound_vars, &mut planned_patterns); + } + } + + planned_patterns + } + + fn resolve_query_patterns(&self, raw_queries: &[String]) -> Vec { + let mut patterns = Vec::new(); + + for query_str in raw_queries { + // parse_sparql_query returns a big tuple; field index 2 + // is the Vec of raw (&str, &str, &str) triple patterns, + // field index 5 is the HashMap of prefixes. 
+ if let Ok((_rest, parsed)) = crate::parser::parse_sparql_query(query_str) { + let raw_patterns = parsed.2; // Vec<(&str, &str, &str)> + let query_prefixes = parsed.5; // HashMap + + // Merge query prefixes with database prefixes + let mut all_prefixes = self.prefixes.clone(); + for (k, v) in query_prefixes { + all_prefixes.insert(k, v); + } + let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(object_str), - }; - drop(dict); - triples.push(triple); - } - } - } - } - Ok(Event::Text(e)) => { - // Use Reader's decode method and trim whitespace - if let Ok(object_str) = xml_reader.decoder().decode(e.as_ref()) { - let trimmed_object = object_str.trim(); - // Skip empty or whitespace-only text - if !trimmed_object.is_empty() { - if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { - if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { - let resolved_predicate = self.resolve_term(predicate_str); - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(subject_str), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(trimmed_object), - }; - drop(dict); - triples.push(triple); - } - } - } - } - } - Ok(Event::End(ref e)) => { - if e.name() == QName(b"rdf:Description") { - current_subject.clear(); - current_predicate.clear(); - } - } - Ok(Event::Eof) => break, - Err(e) => { - eprintln!("Error reading XML: {:?}", e); - break; - } - _ => {} - } - - buf.clear(); - - if triples.len() >= 8192 { - // Process triples in parallel using Rayon - let local_triples: BTreeSet = triples.into_par_iter().collect(); - self.triples.extend(local_triples); - triples = Vec::with_capacity(8192); - } - } - - if !triples.is_empty() { - let local_triples: BTreeSet = triples.into_par_iter().collect(); - self.triples.extend(local_triples); - } - } - - // New parse_turtle function - pub fn 
parse_turtle(&mut self, turtle_data: &str) { - let lines = turtle_data.lines(); - - for line in lines { - let line = line.trim(); - - // Skip empty lines and comments - if line.is_empty() || line.starts_with("#") { - continue; - } - - // Parse triples - let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 3 { - let subject_raw = parts[0].trim_end_matches('.'); - let predicate_raw = parts[1].trim_end_matches('.'); - let object_raw = parts[2..].join(" ").trim_end_matches('.').to_string(); - - // Strip angle brackets from IRIs - let subject = if subject_raw.starts_with('<') && subject_raw.ends_with('>') { - subject_raw[1..subject_raw.len()-1].to_string() - } else { - subject_raw.to_string() - }; + for triple in raw_patterns { + patterns.push(convert_triple_pattern(triple, &mut dict, &all_prefixes)); + } + } + } - let predicate = if predicate_raw.starts_with('<') && predicate_raw.ends_with('>') { - predicate_raw[1..predicate_raw.len()-1].to_string() - } else { - predicate_raw.to_string() - }; + patterns + } - // Clean up object by removing quotes and angle brackets - let object = if object_raw.starts_with('<') && object_raw.ends_with('>') { - object_raw[1..object_raw.len()-1].to_string() - } else if object_raw.starts_with('"') && object_raw.ends_with('"') { - object_raw[1..object_raw.len()-1].to_string() - } else { - object_raw.trim().trim_matches('"').to_string() + pub fn build_all_indexes(&mut self) { + // Memory usage logging + let mut sys = System::new_all(); + let pid = sysinfo::get_current_pid().unwrap(); + sys.refresh_process(pid); + let mem_before = sys.process(pid).unwrap().memory(); + + let triples: Vec = self.triples.iter().cloned().collect(); + + // Clone the config to avoid holding an immutable borrow of `self` + let config = self.index_config.clone(); + + let mut index: Box = match config { + IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), + IndexConfig::SPO => Box::new(SPOSingleIndex::new()), + IndexConfig::POS => 
Box::new(POSSingleIndex::new()), + IndexConfig::OSP => Box::new(OSPSingleIndex::new()), + IndexConfig::PSO => Box::new(PSOSingleIndex::new()), + IndexConfig::OPS => Box::new(OPSSingleIndex::new()), + IndexConfig::SOP => Box::new(SOPSingleIndex::new()), + IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), + + IndexConfig::DynamicHexastore { + eval_interval, + queries, + } => { + let patterns = self.resolve_query_patterns(&queries); + let eval = eval_interval as usize; + Box::new(DynamicHexastoreIndex::new(patterns, eval)) + } + + IndexConfig::Buckets { queries } => { + // Now it's perfectly fine to borrow `self` mutably! + let patterns = self.resolve_planned_access_patterns(&queries); + Box::new(BucketIndex::new(patterns)) + } // Future index types go here: + // IndexConfig::YourNewIndex { some_param, queries } => { + // let patterns = self.resolve_query_patterns(&queries); + // Box::new(YourNewIndex::new(patterns, some_param)) + // } }; + index.build_from_triples(&triples); + index.optimize(); + + // Memory usage logging + sys.refresh_process(pid); + let mem_after = sys.process(pid).unwrap().memory(); + println!( + "[Memory Debug] Index Build memory cost: {} MB", + (mem_after - mem_before) / 1024 / 1024 + ); + + self.index_manager = Some(index); + } + + /// Get a reference to the index. + /// Panics if `build_all_indexes()` hasn't been called yet. + pub fn index(&self) -> &dyn TripleIndex { + self.index_manager + .as_deref() + .expect("index not built — call build_all_indexes() first") + } + + /// Get a mutable reference to the index. + /// Panics if `build_all_indexes()` hasn't been called yet. 
+ pub fn index_mut(&mut self) -> &mut dyn TripleIndex { + self.index_manager + .as_deref_mut() + .expect("index not built — call build_all_indexes() first") + } + + pub fn get_or_build_stats(&mut self) -> Arc { + if let Some(stats) = &self.cached_stats { + return stats.clone(); // ← Clone the Arc (cheap), not the DatabaseStats + } + + let stats = Arc::new(DatabaseStats::gather_stats_fast(self)); + self.cached_stats = Some(stats.clone()); + stats + } + + pub fn invalidate_stats_cache(&mut self) { + self.cached_stats = None; + } + + pub fn query(&self) -> QueryBuilder<'_> { + QueryBuilder::new(self) + } + + pub fn add_triple(&mut self, triple: Triple) { + self.triples.insert(triple.clone()); + if let Some(ref mut idx) = self.index_manager { + idx.insert(&triple); + } + } + + pub fn delete_triple(&mut self, triple: &Triple) -> bool { + let removed = self.triples.remove(triple); + if removed { + if let Some(ref mut idx) = self.index_manager { + idx.delete(triple); + } + } + removed + } + + /// Helper function that accepts parts of a triple, constructs a Triple, and adds it + pub fn add_triple_parts(&mut self, subject: &str, predicate: &str, object: &str) { let mut dict = self.dictionary.write().unwrap(); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); + drop(dict); + let triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), + subject: subject_id, + predicate: predicate_id, + object: object_id, }; + self.add_triple(triple); + } + + /// Helper function that accepts parts of a triple, constructs a Triple, and deletes it + pub fn delete_triple_parts(&mut self, subject: &str, predicate: &str, object: &str) -> bool { + let mut dict = self.dictionary.write().unwrap(); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); drop(dict); - self.triples.insert(triple); 
- } else { - eprintln!("Skipping invalid line: {}", line); - } - } - } - - // New parse_n3 function - pub fn parse_n3(&mut self, n3_data: &str) { - let lines: Vec = n3_data.lines().map(|l| l.trim().to_string()).collect(); - let chunk_size = 1000; - let chunks: Vec> = lines - .chunks(chunk_size) - .map(|c| c.to_vec()) - .collect(); - - let partial_results: Vec<(BTreeSet, Arc>, HashMap)> = - chunks.par_iter().map(|chunk| { - let mut local_db = SparqlDatabase::new(); - let mut statement = String::new(); - - for raw_line in chunk { - let mut line = raw_line.as_str(); - if let Some(comment_start) = line.find('#') { - line = &line[..comment_start]; - line = line.trim(); - } - if line.is_empty() { - continue; - } - if line.starts_with("@prefix") { - let line = line.trim_start_matches("@prefix").trim_end_matches('.'); - let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 2 { - let prefix = parts[0].trim_end_matches(':').to_string(); - let uri = parts[1].trim_start_matches('<').trim_end_matches('>').to_string(); - local_db.prefixes.insert(prefix, uri); - } else { - eprintln!("Invalid prefix declaration: {}", line); - } - } else { - statement.push_str(line); - statement.push(' '); - if line.ends_with('.') { - local_db.parse_statement(statement.trim()); - statement.clear(); - } - } - } - - (local_db.triples, local_db.dictionary, local_db.prefixes) - }).collect(); - - for (triples, dict_arc, pref) in partial_results { - for t in triples { - self.triples.insert(t); - } - let mut self_dict = self.dictionary.write().unwrap(); - let other_dict = dict_arc.read().unwrap(); - self_dict.merge(&other_dict); - drop(other_dict); - drop(self_dict); - for (k, v) in pref { - self.prefixes.insert(k, v); - } - } - } - - // Parse_ntriples and add to DB function - pub fn parse_ntriples_and_add(&mut self, ntriples_data: &str) { - let partial_results = self.parse_ntriples(ntriples_data); - - let encoded_triples = self.encode_triples(partial_results); - for encoded_triple 
in encoded_triples{ - self.add_triple(encoded_triple); - } - } - - // Parses ntriples - pub fn parse_ntriples(&mut self, ntriples_data: &str) -> Vec> { - let lines: Vec<&str> = ntriples_data.lines().collect(); - let chunk_size = 1000; - let chunks: Vec<&[&str]> = lines.chunks(chunk_size).collect(); - - let partial_results: Vec> = chunks - .par_iter() - .map(|chunk| { - let mut local_triples = Vec::new(); - - for line in chunk.iter() { - let line = line.trim(); - - // Skip empty lines and comments - if line.is_empty() || line.starts_with('#') { - continue; - } - // N-Triples must end with a dot - if !line.ends_with('.') { - eprintln!("Invalid N-Triples line (missing dot): {}", line); - continue; - } + let triple = Triple { + subject: subject_id, + predicate: predicate_id, + object: object_id, + }; + self.delete_triple(&triple) + } + + pub fn generate_rdf_xml(&mut self) -> String { + let mut xml = String::new(); + xml.push_str("\n"); + xml.push_str("\n"); + + // Group triples by subject + let dict = self.dictionary.read().unwrap(); + let mut subjects: BTreeMap> = BTreeMap::new(); + for triple in &self.triples { + let subject = dict.decode(triple.subject); + let predicate = dict.decode(triple.predicate); + let object = dict.decode(triple.object); + subjects + .entry(subject.unwrap().to_string()) + .or_default() + .push((predicate.unwrap().to_string(), object.unwrap().to_string())); + } + drop(dict); - // Parse the triple - if let Some((subject, predicate, object)) = self.parse_ntriples_line(line_without_dot) { - local_triples.push((subject, predicate, object)); - } + // For each subject, create an element. 
+ for (subject, po_pairs) in subjects { + xml.push_str(&format!(" \n", subject)); + for (predicate, object) in po_pairs { + xml.push_str(&format!(" <{}>{}\n", predicate, object, predicate)); + } + xml.push_str(" \n"); } - local_triples - }) - .collect(); - partial_results - } + xml.push_str("\n"); + xml + } - // Encode triples - pub fn encode_triples(&mut self, non_encoded_triples: Vec>) -> Vec{ - // Merge results with main dictionary - let mut encoded_triples = Vec::new(); - for triple_strings in non_encoded_triples { - for (subject, predicate, object) in triple_strings { - let mut dict = self.dictionary.write().unwrap(); - let main_triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - encoded_triples.push(main_triple); - } - } - encoded_triples - } - - pub fn parse_and_encode_ntriples(&mut self, ntriples_data: &str) -> Vec{ - let partial_results = self.parse_ntriples(ntriples_data); - - self.encode_triples(partial_results) - } - - // Helper method to parse a single N-Triples line - fn parse_ntriples_line(&self, line: &str) -> Option<(String, String, String)> { - let mut parts = Vec::new(); - let mut current_part = String::new(); - let mut in_uri = false; - let mut in_literal = false; - let mut escaped = false; - let mut chars = line.chars().peekable(); - - while let Some(ch) = chars.next() { - match ch { - '<' if !in_literal && !escaped => { - in_uri = true; - current_part.push(ch); - } - '>' if in_uri && !escaped => { - in_uri = false; - current_part.push(ch); - parts.push(current_part.trim().to_string()); - current_part.clear(); - } - '"' if !in_uri && !escaped => { - in_literal = !in_literal; - current_part.push(ch); - if !in_literal { - // Check for datatype or language tag after closing quote - while let Some(&next_ch) = chars.peek() { - if next_ch == '^' || next_ch == '@' { - current_part.push(chars.next().unwrap()); - // Handle ^^ for datatypes - if next_ch == '^' { - 
if let Some(&second_caret) = chars.peek() { - if second_caret == '^' { - current_part.push(chars.next().unwrap()); - // Now consume the datatype URI - while let Some(&datatype_ch) = chars.peek() { - if datatype_ch == '<' { - // Start of datatype URI - current_part.push(chars.next().unwrap()); - let mut in_datatype_uri = true; - while let Some(&uri_ch) = chars.peek() { - current_part.push(chars.next().unwrap()); - if uri_ch == '>' { - in_datatype_uri = false; - break; - } - } - if !in_datatype_uri { + pub fn parse_rdf(&mut self, rdf_xml: &str) { + let mut reader = Reader::from_str(rdf_xml); + + let mut current_subject = Vec::with_capacity(128); + let mut current_predicate = Vec::with_capacity(128); + + let (sender, receiver) = unbounded::>(); + let dictionary = Arc::clone(&self.dictionary); + let triples_set = Arc::new(Mutex::new(Vec::new())); + let num_threads = utils::get_num_cpus(); + + // Crossbeam scope to manage threads + scope(|s| { + // Spawn worker threads + for _ in 0..num_threads { + let receiver = receiver.clone(); + let triples_set = Arc::clone(&triples_set); + s.spawn(move |_| { + while let Ok(chunk) = receiver.recv() { + if chunk.is_empty() { + // Termination signal break; - } - } else if datatype_ch.is_whitespace() { - break; - } else { - current_part.push(chars.next().unwrap()); } - } + + // Process chunk using Rayon + let local_triples: BTreeSet = + chunk.into_par_iter().map(|triple| triple).collect(); + + // Insert into shared triples set + let mut triples = triples_set.lock().unwrap(); + triples.push(local_triples); } - } - } else if next_ch == '@' { - // Language tag - while let Some(&lang_ch) = chars.peek() { - if lang_ch.is_alphanumeric() || lang_ch == '-' { - current_part.push(chars.next().unwrap()); - } else { - break; + }); + } + + // Parsing and sending chunks + let mut triples = Vec::with_capacity(8192); + loop { + match reader.read_event() { + Ok(Event::Start(ref e)) => match e.name() { + QName(b"rdf:RDF") => { + for attr in 
e.attributes().filter_map(Result::ok) { + let key = attr.key; + let value = attr.value; + if key.as_ref().starts_with(b"xmlns:") { + let prefix = std::str::from_utf8(&key.as_ref()[6..]) + .unwrap_or("") + .to_string(); + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert(prefix, uri); + } else if key.as_ref() == b"xmlns" { + // Default namespace + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert("".to_string(), uri); + } + } + } + QName(b"rdf:Description") => { + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:about") { + current_subject.truncate(0); + current_subject.extend_from_slice(&attr.value); + } + } + } + QName(b"rdfs:Class") | QName(b"rdf:type") => { + current_predicate.truncate(0); + current_predicate.extend_from_slice(b"rdf:type"); + } + QName(b"rdfs:subClassOf") => { + current_predicate.truncate(0); + current_predicate.extend_from_slice(b"rdfs:subClassOf"); + } + QName(b"rdfs:label") => { + current_predicate.truncate(0); + current_predicate.extend_from_slice(b"rdfs:label"); + } + name => { + let name_str = + std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); + let resolved_predicate = self.resolve_term(&name_str); + current_predicate = resolved_predicate.clone().into_bytes(); + } + }, + Ok(Event::Empty(ref e)) => { + if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { + let resolved_predicate = self.resolve_term(predicate); + let mut object = Vec::with_capacity(128); + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:resource") { + object.extend_from_slice(&attr.value); + } + } + if !object.is_empty() { + if let (Ok(subject_str), Ok(object_str)) = ( + std::str::from_utf8(¤t_subject), + std::str::from_utf8(&object), + ) { + // Lock the dictionary for encoding + let mut dict = dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: 
dict.encode(&resolved_predicate), + object: dict.encode(object_str), + }; + drop(dict); // Release the lock + triples.push(triple); + } + } + } + } + Ok(Event::Text(e)) => { + // Use Reader's decode method and trim whitespace + if let Ok(object_str) = reader.decoder().decode(e.as_ref()) { + let trimmed_object = object_str.trim(); + // Skip empty or whitespace-only text + if !trimmed_object.is_empty() { + if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { + if let Ok(predicate_str) = + std::str::from_utf8(¤t_predicate) + { + let resolved_predicate = self.resolve_term(predicate_str); + // Lock the dictionary for encoding + let mut dict = dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(trimmed_object), + }; + drop(dict); // Release the lock + triples.push(triple); + } + } + } + } } - } + Ok(Event::End(ref e)) => { + if e.name() == QName(b"rdf:Description") { + current_subject.truncate(0); + current_predicate.truncate(0); + } + } + Ok(Event::Eof) => break, + Err(e) => { + eprintln!("Error reading XML: {:?}", e); + break; + } + _ => {} + } + + if triples.len() >= 8192 { + sender.send(triples).unwrap(); + triples = Vec::with_capacity(8192); } - break; - } else if next_ch.is_whitespace() { - break; - } else { - // Unexpected character after literal - break; - } } - parts.push(current_part.trim().to_string()); - current_part.clear(); - } - } - '\\' if (in_uri || in_literal) && !escaped => { - escaped = true; - current_part.push(ch); - } - ' ' | '\t' if !in_uri && !in_literal && !escaped => { - if !current_part.is_empty() { - parts.push(current_part.trim().to_string()); - current_part.clear(); - } - } - _ => { - escaped = false; - current_part.push(ch); - } - } - } - - if !current_part.is_empty() { - parts.push(current_part.trim().to_string()); - } - - if parts.len() == 3 { - let subject = self.clean_ntriples_term(&parts[0]); - // Expand the Turtle `a` 
shorthand for rdf:type in predicate position. - let predicate = if parts[1] == "a" { - "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".to_string() - } else { - self.clean_ntriples_term(&parts[1]) - }; - let object = self.clean_ntriples_term(&parts[2]); - Some((subject, predicate, object)) - } else { - eprintln!("Invalid N-Triples line (expected 3 parts, got {}): {}", parts.len(), line); - None - } - } - - // Helper method to clean N-Triples terms - fn clean_ntriples_term(&self, term: &str) -> String { - let term = term.trim(); - - // Handle URIs - if term.starts_with('<') && term.ends_with('>') { - return term[1..term.len()-1].to_string(); - } - - // Handle literals (keep quotes and datatype/language info) - if term.starts_with('"') { - if let Some(close_quote_pos) = term[1..].find('"') { - let close_quote_pos = close_quote_pos + 1; - let literal_value = &term[1..close_quote_pos]; - let rest = &term[close_quote_pos + 1..]; - if rest.is_empty() { - return literal_value.to_string(); - } else if rest.starts_with("^^") { - return literal_value.to_string(); - } else if rest.starts_with("@") { - return format!("{}{}", literal_value, rest); - } - } - } - - // Return as-is for other cases - term.to_string() - } - - fn parse_statement(&mut self, statement: &str) { - let mut tokens = statement.split_whitespace().peekable(); - let mut subject = String::new(); - let mut predicate = String::new(); - let mut current_state = "subject"; - - while let Some(token) = tokens.next() { - match token { - ";" => { - predicate.clear(); - current_state = "predicate"; - } - "." 
=> { - // End of statement - break; - } - _ => match current_state { - "subject" => { - subject = token.to_string(); - current_state = "predicate"; - } - "predicate" => { - predicate = token.to_string(); - current_state = "object"; - } - "object" => { - let mut object = token.to_string(); - - // Collect tokens until we reach ';', '.', or ',' - while let Some(next_token) = tokens.peek() { - if *next_token == ";" || *next_token == "." || *next_token == "," { - break; - } - // Consume the token - let next_token = tokens.next().unwrap(); - object.push(' '); - object.push_str(next_token); - } - - // Resolve terms and store the triple - let resolved_subject = self.resolve_term(&subject); - let resolved_predicate = self.resolve_term(&predicate); - let resolved_object = self.resolve_term(&object); - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&resolved_subject), - predicate: dict.encode(&resolved_predicate), - object: dict.encode(&resolved_object), - }; - drop(dict); - self.triples.insert(triple); - - current_state = "predicate"; - } - _ => {} - }, - } - } - } - - fn resolve_term(&self, term: &str) -> String { - if term.starts_with('<') && term.ends_with('>') { - term.trim_start_matches('<') - .trim_end_matches('>') - .to_string() - } else if term.starts_with('"') { - // It's a literal, possibly with a datatype or language tag - if let Some(pos) = term.rfind('"') { - let literal = &term[..=pos]; // Include the closing quote - let rest = &term[pos + 1..]; // After the closing quote - let mut result = literal.to_string(); - if rest.starts_with("^^") { - // It's a typed literal - let datatype = rest[2..].trim(); - let resolved_datatype = self.resolve_term(datatype); - result.push_str("^^"); - result.push_str(&resolved_datatype); - } else if rest.starts_with('@') { - // It's a language-tagged literal - result.push_str(rest); - } - result - } else { - // Malformed literal - term.to_string() - } - } else if term.contains(':') 
- && !term.starts_with("http://") - && !term.starts_with("https://") - { - let mut parts = term.splitn(2, ':'); - let prefix = parts.next().unwrap(); - let local_name = parts.next().unwrap_or(""); - if let Some(uri) = self.prefixes.get(prefix) { - format!("{}{}", uri, local_name) - } else { - eprintln!("Unknown prefix: {}", prefix); - term.to_string() - } - } else { - term.to_string() - } - } - - // Method to automatically extract and register prefixes from a query string - pub fn register_prefixes_from_query(&mut self, query: &str) { - // Simple regex to extract PREFIX declarations - let prefix_pattern = regex::Regex::new(r"PREFIX\s+([a-zA-Z0-9_]+):\s*<([^>]+)>").unwrap(); - - for captures in prefix_pattern.captures_iter(query) { - if captures.len() >= 3 { - let prefix = captures[1].to_string(); - let uri = captures[2].to_string(); - self.prefixes.insert(prefix, uri); - } - } - } - - // Method to ensure prefixes are properly shared between components - pub fn share_prefixes_with(&self, prefixes: &mut HashMap) { - for (prefix, uri) in &self.prefixes { - prefixes.insert(prefix.clone(), uri.clone()); - } - } - - pub fn resolve_query_term(&self, term: &str, prefixes: &HashMap) -> String { - if term.starts_with('<') && term.ends_with('>') { - term.trim_start_matches('<') - .trim_end_matches('>') - .to_string() - } else if term.starts_with('"') && term.ends_with('"') { - term.trim_matches('"').to_string() - } else if term.contains(':') - && !term.starts_with("http://") - && !term.starts_with("https://") - { - let mut parts = term.splitn(2, ':'); - let prefix = parts.next().unwrap(); - let local_name = parts.next().unwrap_or(""); - - // First check the passed prefixes map - if let Some(uri) = prefixes.get(prefix) { - format!("{}{}", uri, local_name) - } - // Then check the database's own prefixes map as a fallback - else if let Some(uri) = self.prefixes.get(prefix) { - format!("{}{}", uri, local_name) - } else { - eprintln!("Unknown prefix in query: {}", prefix); - 
term.to_string() - } - } else { - term.to_string() - } - } - - pub fn add_stream_data(&mut self, triple: Triple, timestamp: u64) { - self.streams.push(TimestampedTriple { triple, timestamp }); - } - - pub fn time_based_window(&self, start: u64, end: u64) -> BTreeSet { - self.streams - .iter() - .filter(|ts_triple| ts_triple.timestamp >= start && ts_triple.timestamp <= end) - .map(|ts_triple| ts_triple.triple.clone()) - .collect() - } - - pub fn apply_filters_simd<'a>( - &self, - results: Vec>, - filters: Vec>, - ) -> Vec> { - results - .into_iter() - .filter(|result| { - filters.iter().all(|filter_expr| { - match filter_expr { - FilterExpression::Comparison(var, operator, value) => { - // Check if either side contains arithmetic operations - let has_arithmetic = var.contains('+') || var.contains('-') || - var.contains('*') || var.contains('/') || - value.contains('+') || value.contains('-') || - value.contains('*') || value.contains('/'); - - if has_arithmetic { - // Use the non-SIMD arithmetic expression evaluator for complex expressions - let left_result = self.evaluate_arithmetic_string(result, var); - let right_result = self.evaluate_arithmetic_string(result, value); - match (left_result, right_result) { - (Ok(left_val), Ok(right_val)) => { - // Both sides are numeric, perform comparison - match *operator { - "=" => left_val == right_val, - "!=" => left_val != right_val, - ">" => left_val > right_val, - ">=" => left_val >= right_val, - "<" => left_val < right_val, - "<=" => left_val <= right_val, - _ => false, - } - }, - _ => false // At least one expression couldn't be evaluated - } - } else { - // For simple expressions without arithmetic operators, use the SIMD approach - if let Some(var_value_str) = result.get(var) { - // First, try parsing both values as numbers - let var_value_num = var_value_str.parse::(); - let filter_value_num = value.parse::(); - - if var_value_num.is_ok() && filter_value_num.is_ok() { - // Both values are numeric, perform SIMD 
numeric comparison - let var_value = var_value_num.unwrap(); - let filter_value = filter_value_num.unwrap(); - - // On x86 (SSE2) or x86_64 (SSE2) use SIMD intrinsics - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - unsafe { - // Load values into SIMD registers - let var_simd = _mm_set1_epi32(var_value); - let filter_simd = _mm_set1_epi32(filter_value); - return match *operator { - "=" => _mm_movemask_epi8(_mm_cmpeq_epi32( - var_simd, - filter_simd, - )) == 0xFFFF, - "!=" => _mm_movemask_epi8(_mm_cmpeq_epi32( - var_simd, - filter_simd, - )) != 0xFFFF, - ">" => _mm_movemask_epi8(_mm_cmpgt_epi32( - var_simd, - filter_simd, - )) == 0xFFFF, - ">=" => { - let eq = _mm_cmpeq_epi32(var_simd, filter_simd); - let gt = _mm_cmpgt_epi32(var_simd, filter_simd); - _mm_movemask_epi8(_mm_or_si128(eq, gt)) == 0xFFFF - } - "<" => _mm_movemask_epi8(_mm_cmpgt_epi32( - filter_simd, - var_simd, - )) == 0xFFFF, - "<=" => { - let eq = _mm_cmpeq_epi32(var_simd, filter_simd); - let lt = _mm_cmpgt_epi32(filter_simd, var_simd); - _mm_movemask_epi8(_mm_or_si128(eq, lt)) == 0xFFFF - } - _ => false, - }; - } - } + if !triples.is_empty() { + sender.send(triples).unwrap(); + } - // On ARM (aarch64) use NEON intrinsics - #[cfg(target_arch = "aarch64")] - { - unsafe { - let var_neon = vdupq_n_s32(var_value); - let filter_neon = vdupq_n_s32(filter_value); - return match *operator { - "=" => { - let cmp = vceqq_s32(var_neon, filter_neon); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - "!=" => { - let cmp = vceqq_s32(var_neon, filter_neon); - !((vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF)) - } - ">" => { - let cmp = vcgtq_s32(var_neon, filter_neon); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) 
- && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - ">=" => { - let eq = vceqq_s32(var_neon, filter_neon); - let gt = vcgtq_s32(var_neon, filter_neon); - let cmp = vorrq_u32(eq, gt); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - "<" => { - let cmp = vcgtq_s32(filter_neon, var_neon); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - "<=" => { - let eq = vceqq_s32(var_neon, filter_neon); - let lt = vcgtq_s32(filter_neon, var_neon); - let cmp = vorrq_u32(eq, lt); - (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) - } - _ => false, - } - } - } - - // Fallback (or if compiled for a non‐SIMD platform) - #[cfg(not(any( - target_arch = "x86", - target_arch = "x86_64", - target_arch = "aarch64" - )))] - { - return match *operator { - "=" => var_value == filter_value, - "!=" => var_value != filter_value, - ">" => var_value > filter_value, - ">=" => var_value >= filter_value, - "<" => var_value < filter_value, - "<=" => var_value <= filter_value, - _ => false, - }; - } - } else { - // At least one value is a string, perform string comparison - let var_bytes = var_value_str.as_bytes(); - let filter_bytes = value.as_bytes(); - - let var_len = var_bytes.len(); - let filter_len = filter_bytes.len(); - - // If lengths differ, they can't be equal - if var_len != filter_len { - return match *operator { - "=" => false, - "!=" => true, - _ => false, // Other operators are not supported for strings - }; - } - - let mut i = 0; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - unsafe { - while i + 16 <= var_len { - let var_chunk = _mm_loadu_si128( - 
var_bytes[i..].as_ptr() as *const __m128i, - ); - let filter_chunk = _mm_loadu_si128( - filter_bytes[i..].as_ptr() as *const __m128i, - ); - let cmp = _mm_cmpeq_epi8(var_chunk, filter_chunk); - let mask = _mm_movemask_epi8(cmp); - if mask != 0xFFFF { - return match *operator { - "=" => false, - "!=" => true, - _ => false, - }; - } - i += 16; + // Send termination signals + for _ in 0..num_threads { + sender.send(Vec::new()).unwrap(); + } + }) + .unwrap(); + + // Merge all BTreeSets into the main triples set + let triples_sets = Arc::try_unwrap(triples_set).unwrap().into_inner().unwrap(); + for local_triples in triples_sets { + self.triples.extend(local_triples); + } + } + + pub fn parse_rdf_from_file(&mut self, filename: &str) { + let file = std::fs::File::open(filename).expect("Cannot open file"); + let reader = std::io::BufReader::new(file); + let mut xml_reader = Reader::from_reader(reader); + + let mut current_subject = Vec::with_capacity(128); + let mut current_predicate = Vec::with_capacity(128); + + // First, read prefixes before spawning worker threads + let mut buf = Vec::new(); + loop { + match xml_reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + if e.name() == QName(b"rdf:RDF") { + // Read prefixes + for attr in e.attributes().filter_map(Result::ok) { + let key = attr.key; + let value = attr.value; + if key.as_ref().starts_with(b"xmlns:") { + let prefix = std::str::from_utf8(&key.as_ref()[6..]) + .unwrap_or("") + .to_string(); + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert(prefix, uri); + } else if key.as_ref() == b"xmlns" { + // Default namespace + let uri = std::str::from_utf8(&value).unwrap_or("").to_string(); + self.prefixes.insert("".to_string(), uri); + } } - } + break; // We have read the prefixes, proceed to the rest } - - #[cfg(target_arch = "aarch64")] - { - unsafe { - while i + 16 <= var_len { - let var_chunk = vld1q_u8(var_bytes[i..].as_ptr()); - let filter_chunk = 
vld1q_u8(filter_bytes[i..].as_ptr()); - let cmp = vceqq_u8(var_chunk, filter_chunk); - let cmp_arr: [u8; 16] = std::mem::transmute(cmp); - if cmp_arr.iter().any(|&lane| lane != 0xFF) { - return match *operator { - "=" => false, - "!=" => true, - _ => false, - }; - } - i += 16; + } + Ok(Event::Eof) => { + eprintln!("Reached EOF before reading prefixes."); + break; + } + Err(e) => { + eprintln!("Error reading XML: {:?}", e); + break; + } + _ => {} + } + buf.clear(); + } + + // Continue reading and parsing the rest of the file + let mut triples = Vec::with_capacity(8192); + loop { + match xml_reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => match e.name() { + QName(b"rdf:Description") => { + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:about") { + current_subject.clear(); + current_subject.extend_from_slice(&attr.value); + } } - } } - - // Handle remaining bytes - if i < var_len { - for j in i..var_len { - if var_bytes[j] != filter_bytes[j] { - return match *operator { - "=" => false, - "!=" => true, - _ => false, - }; + QName(b"rdfs:Class") | QName(b"rdf:type") => { + current_predicate.clear(); + current_predicate.extend_from_slice(b"rdf:type"); + } + QName(b"rdfs:subClassOf") => { + current_predicate.clear(); + current_predicate.extend_from_slice(b"rdfs:subClassOf"); + } + QName(b"rdfs:label") => { + current_predicate.clear(); + current_predicate.extend_from_slice(b"rdfs:label"); + } + name => { + let name_str = std::str::from_utf8(name.as_ref()).unwrap_or("").to_string(); + let resolved_predicate = self.resolve_term(&name_str); + current_predicate = resolved_predicate.clone().into_bytes(); + } + }, + Ok(Event::Empty(ref e)) => { + if let Ok(predicate) = std::str::from_utf8(e.name().as_ref()) { + let resolved_predicate = self.resolve_term(predicate); + let mut object = Vec::with_capacity(128); + for attr in e.attributes().filter_map(Result::ok) { + if attr.key == QName(b"rdf:resource") { + 
object.extend_from_slice(&attr.value); + } + } + if !object.is_empty() { + if let (Ok(subject_str), Ok(object_str)) = ( + std::str::from_utf8(¤t_subject), + std::str::from_utf8(&object), + ) { + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(object_str), + }; + drop(dict); + triples.push(triple); + } } - } } - - // Strings are equal - match *operator { - "=" => true, - "!=" => false, - _ => false, // Other operators not supported for strings + } + Ok(Event::Text(e)) => { + // Use Reader's decode method and trim whitespace + if let Ok(object_str) = xml_reader.decoder().decode(e.as_ref()) { + let trimmed_object = object_str.trim(); + // Skip empty or whitespace-only text + if !trimmed_object.is_empty() { + if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { + if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { + let resolved_predicate = self.resolve_term(predicate_str); + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(subject_str), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(trimmed_object), + }; + drop(dict); + triples.push(triple); + } + } + } } - } - } else { - false } - } - }, - FilterExpression::And(left, right) => { - self.evaluate_filter_expression(result, left) && - self.evaluate_filter_expression(result, right) - }, - FilterExpression::Or(left, right) => { - self.evaluate_filter_expression(result, left) || - self.evaluate_filter_expression(result, right) - }, - FilterExpression::Not(expr) => { - !self.evaluate_filter_expression(result, expr) - }, - FilterExpression::ArithmeticExpr(expr_str) => { - // True if it's non-zero - match self.evaluate_arithmetic_string(result, expr_str) { - Ok(val) => val != 0.0, - Err(_) => false, - } + Ok(Event::End(ref e)) => { + if e.name() == QName(b"rdf:Description") { + current_subject.clear(); + 
current_predicate.clear(); + } + } + Ok(Event::Eof) => break, + Err(e) => { + eprintln!("Error reading XML: {:?}", e); + break; + } + _ => {} } - } - }) - }) - .collect() - } - - // Helper function to evaluate an arithmetic expression - fn evaluate_arithmetic_expression<'a>( - &self, - result: &BTreeMap<&'a str, String>, - expr: &shared::query::ArithmeticExpression<'a> - ) -> Result { - match expr { - shared::query::ArithmeticExpression::Operand(operand) => { - // Check if it's a variable - if operand.starts_with('?') { - if let Some(var_value) = result.get(*operand) { - // Parse the variable value as a number - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) - } else { - Err(format!("Variable '{}' not found", operand)) - } - } - // Check if it's a numeric literal - else if operand.chars().all(|c| c.is_digit(10) || c == '.') { - operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) - } - // Check if it's a string literal - else if operand.starts_with('"') && operand.ends_with('"') { - Err(format!("Cannot perform arithmetic on string literal '{}'", operand)) - } - // Parse it as a number - else { - operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) - } - }, - shared::query::ArithmeticExpression::Add(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - Ok(left_val + right_val) - }, - shared::query::ArithmeticExpression::Subtract(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - Ok(left_val - right_val) - }, - shared::query::ArithmeticExpression::Multiply(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - Ok(left_val * right_val) - }, - 
shared::query::ArithmeticExpression::Divide(left, right) => { - let left_val = self.evaluate_arithmetic_expression(result, left)?; - let right_val = self.evaluate_arithmetic_expression(result, right)?; - if right_val == 0.0 { - Err("Division by zero".to_string()) - } else { - Ok(left_val / right_val) - } - } - } - } - - // Helper function to parse and evaluate an arithmetic expression from a string - fn evaluate_arithmetic_string<'a>( - &self, - result: &BTreeMap<&'a str, String>, - expr_str: &'a str - ) -> Result { - // Check for parenthesized expressions and remove them if needed - let expr_to_parse = if expr_str.starts_with('(') && expr_str.ends_with(')') { - &expr_str[1..expr_str.len()-1] - } else { - expr_str - }; - - if expr_to_parse.contains('+') || expr_to_parse.contains('-') || - expr_to_parse.contains('*') || expr_to_parse.contains('/') { - // Parse the expression string into an ArithmeticExpression - match parser::parse_arithmetic_expression(expr_to_parse) { - Ok((_, arithmetic_expr)) => { - // Evaluate the parsed expression - self.evaluate_arithmetic_expression(result, &arithmetic_expr) - }, - Err(e) => { - // Print the error - eprintln!("Failed to parse arithmetic expression '{}': {:?}", expr_to_parse, e); - - // If parsing fails, try to treat it as a simple operand - if expr_to_parse.starts_with('?') { - // It's a variable - if let Some(var_value) = result.get(expr_to_parse) { - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) - } else { - Err(format!("Variable '{}' not found", expr_to_parse)) - } - } else { - // Parse as a number - expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) - } - } - } - } else { - // No arithmetic operators, treat as simple operand - if expr_to_parse.starts_with('?') { - // It's a variable - if let Some(var_value) = result.get(expr_to_parse) { - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) - } else { - 
Err(format!("Variable '{}' not found", expr_to_parse)) - } - } else { - // Parse as a number - expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) - } - } - } - - // Helper method to evaluate a filter expression against a result - fn evaluate_filter_expression<'a>( - &self, - result: &BTreeMap<&'a str, String>, - filter_expr: &FilterExpression<'a> - ) -> bool { - match filter_expr { - FilterExpression::Comparison(left, operator, right) => { - // Evaluate both sides as arithmetic expressions - let left_result = self.evaluate_arithmetic_string(result, left); - let right_result = self.evaluate_arithmetic_string(result, right); - - match (left_result, right_result) { - (Ok(left_val), Ok(right_val)) => { - // Both sides are numeric, perform numeric comparison - match *operator { - "=" => left_val == right_val, - "!=" => left_val != right_val, - ">" => left_val > right_val, - ">=" => left_val >= right_val, - "<" => left_val < right_val, - "<=" => left_val <= right_val, - _ => false, - } - }, - _ => { - let left_str = if left.starts_with('?') { - // Fix for the type mismatch error - convert to string - match result.get(left) { - Some(val) => val.as_str(), - None => left, - } - } else { - left - }; - let right_str = if right.starts_with('?') { - // Fix for the type mismatch error - convert to string - match result.get(right) { - Some(val) => val.as_str(), - None => right, - } - } else { - right - }; + buf.clear(); - match *operator { - "=" => left_str == right_str, - "!=" => left_str != right_str, - _ => false, // Other operators not supported for strings - } - } - } - }, - FilterExpression::And(left, right) => { - self.evaluate_filter_expression(result, left) && - self.evaluate_filter_expression(result, right) - }, - FilterExpression::Or(left, right) => { - self.evaluate_filter_expression(result, left) || - self.evaluate_filter_expression(result, right) - }, - FilterExpression::Not(expr) => { - 
!self.evaluate_filter_expression(result, expr) - }, - FilterExpression::ArithmeticExpr(expr_str) => { - // An arithmetic expression by itself is evaluated to true if it's non-zero - match self.evaluate_arithmetic_string(result, expr_str) { - Ok(val) => val != 0.0, - Err(_) => false, - } - } - } - } - - pub fn union(&mut self, other: &SparqlDatabase) -> Self { - // Create a new dictionary by cloning and merging - let self_dict = self.dictionary.read().unwrap(); - let other_dict = other.dictionary.read().unwrap(); - let mut merged_dictionary = self_dict.clone(); - drop(self_dict); - - // Re-encode triples from the other database using the merged dictionary - let mut re_encoded_triples = BTreeSet::new(); - for triple in &other.triples { - let subject = - merged_dictionary.encode(other_dict.decode(triple.subject).unwrap()); - let predicate = - merged_dictionary.encode(other_dict.decode(triple.predicate).unwrap()); - let object = merged_dictionary.encode(other_dict.decode(triple.object).unwrap()); - re_encoded_triples.insert(Triple { - subject, - predicate, - object, - }); - } - - // Merge the triples and streams - let union_triples: BTreeSet = - self.triples.union(&re_encoded_triples).cloned().collect(); - let mut union_streams = self.streams.clone(); - for ts_triple in &other.streams { - let subject = merged_dictionary - .encode(other_dict.decode(ts_triple.triple.subject).unwrap()); - let predicate = merged_dictionary - .encode(other_dict.decode(ts_triple.triple.predicate).unwrap()); - let object = - merged_dictionary.encode(other_dict.decode(ts_triple.triple.object).unwrap()); - let re_encoded_ts_triple = TimestampedTriple { - triple: Triple { - subject, - predicate, - object, - }, - timestamp: ts_triple.timestamp, - }; - if !union_streams.contains(&re_encoded_ts_triple) { - union_streams.push(re_encoded_ts_triple); - } - } - drop(other_dict); - - Self { - triples: union_triples, - streams: union_streams, - sliding_window: self.sliding_window.clone(), - dictionary: 
Arc::new(RwLock::new(merged_dictionary)), - prefixes: self.prefixes.clone(), - udfs: HashMap::new(), - index_manager: Some(self.index().clone_empty()), - rule_map: HashMap::new(), - cached_stats: None, - index_config: self.index_config.clone() - } - } - - pub fn par_join(&mut self, other: &SparqlDatabase, predicate: &str) -> Self { - let mut dict = self.dictionary.write().unwrap(); - let predicate_id = dict.encode(predicate); - drop(dict); - let other_map: BTreeMap<&u32, Vec<&Triple>> = other - .triples - .par_iter() - .filter(|other_triple| other_triple.predicate == predicate_id) - .flat_map(|other_triple| { - vec![ - (&other_triple.subject, other_triple), - (&other_triple.object, other_triple), - ] - }) - .fold( - || BTreeMap::new(), - |mut acc, (key, triple)| { - acc.entry(key).or_insert_with(Vec::new).push(triple); - acc - }, - ) - .reduce( - || BTreeMap::new(), - |mut acc, map| { - for (key, triples) in map { - acc.entry(key).or_insert_with(Vec::new).extend(triples); - } - acc - }, - ); - - let joined_triples: BTreeSet = self - .triples - .par_iter() - .filter(|triple| triple.predicate == predicate_id) - .fold( - || BTreeSet::new(), - |mut local_set, triple| { - if let Some(matching_triples) = other_map.get(&triple.object) { - for other_triple in matching_triples { - local_set.insert(Triple { - subject: triple.subject, - predicate: other_triple.predicate, - object: other_triple.object, - }); - } - } - local_set - }, - ) - .reduce( - || BTreeSet::new(), - |mut set1, set2| { - set1.extend(set2); - set1 - }, - ); - - Self { - triples: joined_triples, - streams: self.streams.clone(), - sliding_window: self.sliding_window.clone(), - dictionary: Arc::clone(&self.dictionary), - prefixes: self.prefixes.clone(), - udfs: HashMap::new(), - index_manager: Some(self.index().clone_empty()), - rule_map: HashMap::new(), - cached_stats: None, - index_config: self.index_config.clone(), - } - } - - pub fn perform_join<'a>( - &self, - subject_var: &'a str, - predicate: &'a str, - 
object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - ) -> Vec> { - let mut new_results = Vec::new(); + if triples.len() >= 8192 { + // Process triples in parallel using Rayon + let local_triples: BTreeSet = triples.into_par_iter().collect(); + self.triples.extend(local_triples); + triples = Vec::with_capacity(8192); + } + } + + if !triples.is_empty() { + let local_triples: BTreeSet = triples.into_par_iter().collect(); + self.triples.extend(local_triples); + } + } + + // New parse_turtle function + pub fn parse_turtle(&mut self, turtle_data: &str) { + let lines = turtle_data.lines(); + + for line in lines { + let line = line.trim(); + + // Skip empty lines and comments + if line.is_empty() || line.starts_with("#") { + continue; + } + + // Parse triples + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 3 { + let subject_raw = parts[0].trim_end_matches('.'); + let predicate_raw = parts[1].trim_end_matches('.'); + let object_raw = parts[2..].join(" ").trim_end_matches('.').to_string(); + + // Strip angle brackets from IRIs + let subject = if subject_raw.starts_with('<') && subject_raw.ends_with('>') { + subject_raw[1..subject_raw.len() - 1].to_string() + } else { + subject_raw.to_string() + }; + + let predicate = if predicate_raw.starts_with('<') && predicate_raw.ends_with('>') { + predicate_raw[1..predicate_raw.len() - 1].to_string() + } else { + predicate_raw.to_string() + }; - for triple in triples { - let subject = dictionary.decode(triple.subject).unwrap(); - let pred = dictionary.decode(triple.predicate).unwrap(); - let object = dictionary.decode(triple.object).unwrap(); + // Clean up object by removing quotes and angle brackets + let object = if object_raw.starts_with('<') && object_raw.ends_with('>') { + object_raw[1..object_raw.len() - 1].to_string() + } else if object_raw.starts_with('"') && object_raw.ends_with('"') { + object_raw[1..object_raw.len() - 1].to_string() + } else { + 
object_raw.trim().trim_matches('"').to_string() + }; + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + self.triples.insert(triple); + } else { + eprintln!("Skipping invalid line: {}", line); + } + } + } - if pred == predicate { - for result in &final_results { - let mut extended_result = result.clone(); - let mut valid_extension = true; + // New parse_n3 function + pub fn parse_n3(&mut self, n3_data: &str) { + let lines: Vec = n3_data.lines().map(|l| l.trim().to_string()).collect(); + let chunk_size = 1000; + let chunks: Vec> = lines.chunks(chunk_size).map(|c| c.to_vec()).collect(); + + let partial_results: Vec<( + BTreeSet, + Arc>, + HashMap, + )> = chunks + .par_iter() + .map(|chunk| { + let mut local_db = SparqlDatabase::new(); + let mut statement = String::new(); + + for raw_line in chunk { + let mut line = raw_line.as_str(); + if let Some(comment_start) = line.find('#') { + line = &line[..comment_start]; + line = line.trim(); + } + if line.is_empty() { + continue; + } + if line.starts_with("@prefix") { + let line = line.trim_start_matches("@prefix").trim_end_matches('.'); + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + let prefix = parts[0].trim_end_matches(':').to_string(); + let uri = parts[1] + .trim_start_matches('<') + .trim_end_matches('>') + .to_string(); + local_db.prefixes.insert(prefix, uri); + } else { + eprintln!("Invalid prefix declaration: {}", line); + } + } else { + statement.push_str(line); + statement.push(' '); + if line.ends_with('.') { + local_db.parse_statement(statement.trim()); + statement.clear(); + } + } + } + + (local_db.triples, local_db.dictionary, local_db.prefixes) + }) + .collect(); + + for (triples, dict_arc, pref) in partial_results { + for t in triples { + self.triples.insert(t); + } + let mut self_dict = self.dictionary.write().unwrap(); 
+ let other_dict = dict_arc.read().unwrap(); + self_dict.merge(&other_dict); + drop(other_dict); + drop(self_dict); + for (k, v) in pref { + self.prefixes.insert(k, v); + } + } + } + + // Parse_ntriples and add to DB function + pub fn parse_ntriples_and_add(&mut self, ntriples_data: &str) { + let partial_results = self.parse_ntriples(ntriples_data); + + let encoded_triples = self.encode_triples(partial_results); + for encoded_triple in encoded_triples { + self.add_triple(encoded_triple); + } + } + + // Parses ntriples + pub fn parse_ntriples(&mut self, ntriples_data: &str) -> Vec> { + let lines: Vec<&str> = ntriples_data.lines().collect(); + let chunk_size = 1000; + let chunks: Vec<&[&str]> = lines.chunks(chunk_size).collect(); + + let partial_results: Vec> = chunks + .par_iter() + .map(|chunk| { + let mut local_triples = Vec::new(); + + for line in chunk.iter() { + let line = line.trim(); + + // Skip empty lines and comments + if line.is_empty() || line.starts_with('#') { + continue; + } + + // N-Triples must end with a dot + if !line.ends_with('.') { + eprintln!("Invalid N-Triples line (missing dot): {}", line); + continue; + } + + // Remove the trailing dot + let line_without_dot = &line[..line.len() - 1].trim(); + + // Parse the triple + if let Some((subject, predicate, object)) = + self.parse_ntriples_line(line_without_dot) + { + local_triples.push((subject, predicate, object)); + } + } + + local_triples + }) + .collect(); + partial_results + } + + // Encode triples + pub fn encode_triples( + &mut self, + non_encoded_triples: Vec>, + ) -> Vec { + // Merge results with main dictionary + let mut encoded_triples = Vec::new(); + for triple_strings in non_encoded_triples { + for (subject, predicate, object) in triple_strings { + let mut dict = self.dictionary.write().unwrap(); + let main_triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + encoded_triples.push(main_triple); + 
} + } + encoded_triples + } + + pub fn parse_and_encode_ntriples(&mut self, ntriples_data: &str) -> Vec { + let partial_results = self.parse_ntriples(ntriples_data); + + self.encode_triples(partial_results) + } + + // Helper method to parse a single N-Triples line + fn parse_ntriples_line(&self, line: &str) -> Option<(String, String, String)> { + let mut parts = Vec::new(); + let mut current_part = String::new(); + let mut in_uri = false; + let mut in_literal = false; + let mut escaped = false; + let mut chars = line.chars().peekable(); + + while let Some(ch) = chars.next() { + match ch { + '<' if !in_literal && !escaped => { + in_uri = true; + current_part.push(ch); + } + '>' if in_uri && !escaped => { + in_uri = false; + current_part.push(ch); + parts.push(current_part.trim().to_string()); + current_part.clear(); + } + '"' if !in_uri && !escaped => { + in_literal = !in_literal; + current_part.push(ch); + if !in_literal { + // Check for datatype or language tag after closing quote + while let Some(&next_ch) = chars.peek() { + if next_ch == '^' || next_ch == '@' { + current_part.push(chars.next().unwrap()); + // Handle ^^ for datatypes + if next_ch == '^' { + if let Some(&second_caret) = chars.peek() { + if second_caret == '^' { + current_part.push(chars.next().unwrap()); + // Now consume the datatype URI + while let Some(&datatype_ch) = chars.peek() { + if datatype_ch == '<' { + // Start of datatype URI + current_part.push(chars.next().unwrap()); + let mut in_datatype_uri = true; + while let Some(&uri_ch) = chars.peek() { + current_part.push(chars.next().unwrap()); + if uri_ch == '>' { + in_datatype_uri = false; + break; + } + } + if !in_datatype_uri { + break; + } + } else if datatype_ch.is_whitespace() { + break; + } else { + current_part.push(chars.next().unwrap()); + } + } + } + } + } else if next_ch == '@' { + // Language tag + while let Some(&lang_ch) = chars.peek() { + if lang_ch.is_alphanumeric() || lang_ch == '-' { + 
current_part.push(chars.next().unwrap()); + } else { + break; + } + } + } + break; + } else if next_ch.is_whitespace() { + break; + } else { + // Unexpected character after literal + break; + } + } + parts.push(current_part.trim().to_string()); + current_part.clear(); + } + } + '\\' if (in_uri || in_literal) && !escaped => { + escaped = true; + current_part.push(ch); + } + ' ' | '\t' if !in_uri && !in_literal && !escaped => { + if !current_part.is_empty() { + parts.push(current_part.trim().to_string()); + current_part.clear(); + } + } + _ => { + escaped = false; + current_part.push(ch); + } + } + } + + if !current_part.is_empty() { + parts.push(current_part.trim().to_string()); + } + + if parts.len() == 3 { + let subject = self.clean_ntriples_term(&parts[0]); + // Expand the Turtle `a` shorthand for rdf:type in predicate position. + let predicate = if parts[1] == "a" { + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".to_string() + } else { + self.clean_ntriples_term(&parts[1]) + }; + let object = self.clean_ntriples_term(&parts[2]); + Some((subject, predicate, object)) + } else { + eprintln!( + "Invalid N-Triples line (expected 3 parts, got {}): {}", + parts.len(), + line + ); + None + } + } + + // Helper method to clean N-Triples terms + fn clean_ntriples_term(&self, term: &str) -> String { + let term = term.trim(); + + // Handle URIs + if term.starts_with('<') && term.ends_with('>') { + return term[1..term.len() - 1].to_string(); + } + + // Handle literals (keep quotes and datatype/language info) + if term.starts_with('"') { + if let Some(close_quote_pos) = term[1..].find('"') { + let close_quote_pos = close_quote_pos + 1; + let literal_value = &term[1..close_quote_pos]; + let rest = &term[close_quote_pos + 1..]; + if rest.is_empty() { + return literal_value.to_string(); + } else if rest.starts_with("^^") { + return literal_value.to_string(); + } else if rest.starts_with("@") { + return format!("{}{}", literal_value, rest); + } + } + } + + // Return as-is for 
other cases + term.to_string() + } + + fn parse_statement(&mut self, statement: &str) { + let mut tokens = statement.split_whitespace().peekable(); + let mut subject = String::new(); + let mut predicate = String::new(); + let mut current_state = "subject"; + + while let Some(token) = tokens.next() { + match token { + ";" => { + predicate.clear(); + current_state = "predicate"; + } + "." => { + // End of statement + break; + } + _ => match current_state { + "subject" => { + subject = token.to_string(); + current_state = "predicate"; + } + "predicate" => { + predicate = token.to_string(); + current_state = "object"; + } + "object" => { + let mut object = token.to_string(); + + // Collect tokens until we reach ';', '.', or ',' + while let Some(next_token) = tokens.peek() { + if *next_token == ";" || *next_token == "." || *next_token == "," { + break; + } + // Consume the token + let next_token = tokens.next().unwrap(); + object.push(' '); + object.push_str(next_token); + } + + // Resolve terms and store the triple + let resolved_subject = self.resolve_term(&subject); + let resolved_predicate = self.resolve_term(&predicate); + let resolved_object = self.resolve_term(&object); + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&resolved_subject), + predicate: dict.encode(&resolved_predicate), + object: dict.encode(&resolved_object), + }; + drop(dict); + self.triples.insert(triple); + + current_state = "predicate"; + } + _ => {} + }, + } + } + } + + fn resolve_term(&self, term: &str) -> String { + if term.starts_with('<') && term.ends_with('>') { + term.trim_start_matches('<') + .trim_end_matches('>') + .to_string() + } else if term.starts_with('"') { + // It's a literal, possibly with a datatype or language tag + if let Some(pos) = term.rfind('"') { + let literal = &term[..=pos]; // Include the closing quote + let rest = &term[pos + 1..]; // After the closing quote + let mut result = literal.to_string(); + if 
rest.starts_with("^^") { + // It's a typed literal + let datatype = rest[2..].trim(); + let resolved_datatype = self.resolve_term(datatype); + result.push_str("^^"); + result.push_str(&resolved_datatype); + } else if rest.starts_with('@') { + // It's a language-tagged literal + result.push_str(rest); + } + result + } else { + // Malformed literal + term.to_string() + } + } else if term.contains(':') + && !term.starts_with("http://") + && !term.starts_with("https://") + { + let mut parts = term.splitn(2, ':'); + let prefix = parts.next().unwrap(); + let local_name = parts.next().unwrap_or(""); + if let Some(uri) = self.prefixes.get(prefix) { + format!("{}{}", uri, local_name) + } else { + eprintln!("Unknown prefix: {}", prefix); + term.to_string() + } + } else { + term.to_string() + } + } + + // Method to automatically extract and register prefixes from a query string + pub fn register_prefixes_from_query(&mut self, query: &str) { + // Simple regex to extract PREFIX declarations + let prefix_pattern = regex::Regex::new(r"PREFIX\s+([a-zA-Z0-9_]+):\s*<([^>]+)>").unwrap(); + + for captures in prefix_pattern.captures_iter(query) { + if captures.len() >= 3 { + let prefix = captures[1].to_string(); + let uri = captures[2].to_string(); + self.prefixes.insert(prefix, uri); + } + } + } + + // Method to ensure prefixes are properly shared between components + pub fn share_prefixes_with(&self, prefixes: &mut HashMap) { + for (prefix, uri) in &self.prefixes { + prefixes.insert(prefix.clone(), uri.clone()); + } + } + + pub fn resolve_query_term(&self, term: &str, prefixes: &HashMap) -> String { + if term.starts_with('<') && term.ends_with('>') { + term.trim_start_matches('<') + .trim_end_matches('>') + .to_string() + } else if term.starts_with('"') && term.ends_with('"') { + term.trim_matches('"').to_string() + } else if term.contains(':') + && !term.starts_with("http://") + && !term.starts_with("https://") + { + let mut parts = term.splitn(2, ':'); + let prefix = 
parts.next().unwrap(); + let local_name = parts.next().unwrap_or(""); + + // First check the passed prefixes map + if let Some(uri) = prefixes.get(prefix) { + format!("{}{}", uri, local_name) + } + // Then check the database's own prefixes map as a fallback + else if let Some(uri) = self.prefixes.get(prefix) { + format!("{}{}", uri, local_name) + } else { + eprintln!("Unknown prefix in query: {}", prefix); + term.to_string() + } + } else { + term.to_string() + } + } + + pub fn add_stream_data(&mut self, triple: Triple, timestamp: u64) { + self.streams.push(TimestampedTriple { triple, timestamp }); + } + + pub fn time_based_window(&self, start: u64, end: u64) -> BTreeSet { + self.streams + .iter() + .filter(|ts_triple| ts_triple.timestamp >= start && ts_triple.timestamp <= end) + .map(|ts_triple| ts_triple.triple.clone()) + .collect() + } + + pub fn apply_filters_simd<'a>( + &self, + results: Vec>, + filters: Vec>, + ) -> Vec> { + results + .into_iter() + .filter(|result| { + filters.iter().all(|filter_expr| { + match filter_expr { + FilterExpression::Comparison(var, operator, value) => { + // Check if either side contains arithmetic operations + let has_arithmetic = var.contains('+') + || var.contains('-') + || var.contains('*') + || var.contains('/') + || value.contains('+') + || value.contains('-') + || value.contains('*') + || value.contains('/'); + + if has_arithmetic { + // Use the non-SIMD arithmetic expression evaluator for complex expressions + let left_result = self.evaluate_arithmetic_string(result, var); + let right_result = self.evaluate_arithmetic_string(result, value); + + match (left_result, right_result) { + (Ok(left_val), Ok(right_val)) => { + // Both sides are numeric, perform comparison + match *operator { + "=" => left_val == right_val, + "!=" => left_val != right_val, + ">" => left_val > right_val, + ">=" => left_val >= right_val, + "<" => left_val < right_val, + "<=" => left_val <= right_val, + _ => false, + } + } + _ => false, // At least one 
expression couldn't be evaluated + } + } else { + // For simple expressions without arithmetic operators, use the SIMD approach + if let Some(var_value_str) = result.get(var) { + // First, try parsing both values as numbers + let var_value_num = var_value_str.parse::(); + let filter_value_num = value.parse::(); + + if var_value_num.is_ok() && filter_value_num.is_ok() { + // Both values are numeric, perform SIMD numeric comparison + let var_value = var_value_num.unwrap(); + let filter_value = filter_value_num.unwrap(); + + // On x86 (SSE2) or x86_64 (SSE2) use SIMD intrinsics + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + unsafe { + // Load values into SIMD registers + let var_simd = _mm_set1_epi32(var_value); + let filter_simd = _mm_set1_epi32(filter_value); + return match *operator { + "=" => { + _mm_movemask_epi8(_mm_cmpeq_epi32( + var_simd, + filter_simd, + )) == 0xFFFF + } + "!=" => { + _mm_movemask_epi8(_mm_cmpeq_epi32( + var_simd, + filter_simd, + )) != 0xFFFF + } + ">" => { + _mm_movemask_epi8(_mm_cmpgt_epi32( + var_simd, + filter_simd, + )) == 0xFFFF + } + ">=" => { + let eq = + _mm_cmpeq_epi32(var_simd, filter_simd); + let gt = + _mm_cmpgt_epi32(var_simd, filter_simd); + _mm_movemask_epi8(_mm_or_si128(eq, gt)) + == 0xFFFF + } + "<" => { + _mm_movemask_epi8(_mm_cmpgt_epi32( + filter_simd, + var_simd, + )) == 0xFFFF + } + "<=" => { + let eq = + _mm_cmpeq_epi32(var_simd, filter_simd); + let lt = + _mm_cmpgt_epi32(filter_simd, var_simd); + _mm_movemask_epi8(_mm_or_si128(eq, lt)) + == 0xFFFF + } + _ => false, + }; + } + } + + // On ARM (aarch64) use NEON intrinsics + #[cfg(target_arch = "aarch64")] + { + unsafe { + let var_neon = vdupq_n_s32(var_value); + let filter_neon = vdupq_n_s32(filter_value); + return match *operator { + "=" => { + let cmp = vceqq_s32(var_neon, filter_neon); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + 
== 0xFFFFFFFF) + } + "!=" => { + let cmp = vceqq_s32(var_neon, filter_neon); + !((vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF)) + } + ">" => { + let cmp = vcgtq_s32(var_neon, filter_neon); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) + } + ">=" => { + let eq = vceqq_s32(var_neon, filter_neon); + let gt = vcgtq_s32(var_neon, filter_neon); + let cmp = vorrq_u32(eq, gt); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) + } + "<" => { + let cmp = vcgtq_s32(filter_neon, var_neon); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) + } + "<=" => { + let eq = vceqq_s32(var_neon, filter_neon); + let lt = vcgtq_s32(filter_neon, var_neon); + let cmp = vorrq_u32(eq, lt); + (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) + } + _ => false, + }; + } + } + + // Fallback (or if compiled for a non‐SIMD platform) + #[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "aarch64" + )))] + { + return match *operator { + "=" => var_value == filter_value, + "!=" => var_value != filter_value, + ">" => var_value > filter_value, + ">=" => var_value >= filter_value, + "<" => var_value < filter_value, + "<=" => var_value <= filter_value, + _ => false, + }; + } + } else { + // At least one value is a string, perform string comparison + let var_bytes = var_value_str.as_bytes(); + let filter_bytes = value.as_bytes(); + + let 
var_len = var_bytes.len(); + let filter_len = filter_bytes.len(); + + // If lengths differ, they can't be equal + if var_len != filter_len { + return match *operator { + "=" => false, + "!=" => true, + _ => false, // Other operators are not supported for strings + }; + } + + let mut i = 0; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + unsafe { + while i + 16 <= var_len { + let var_chunk = + _mm_loadu_si128(var_bytes[i..].as_ptr() + as *const __m128i); + let filter_chunk = + _mm_loadu_si128(filter_bytes[i..].as_ptr() + as *const __m128i); + let cmp = + _mm_cmpeq_epi8(var_chunk, filter_chunk); + let mask = _mm_movemask_epi8(cmp); + if mask != 0xFFFF { + return match *operator { + "=" => false, + "!=" => true, + _ => false, + }; + } + i += 16; + } + } + } + + #[cfg(target_arch = "aarch64")] + { + unsafe { + while i + 16 <= var_len { + let var_chunk = + vld1q_u8(var_bytes[i..].as_ptr()); + let filter_chunk = + vld1q_u8(filter_bytes[i..].as_ptr()); + let cmp = vceqq_u8(var_chunk, filter_chunk); + let cmp_arr: [u8; 16] = + std::mem::transmute(cmp); + if cmp_arr.iter().any(|&lane| lane != 0xFF) { + return match *operator { + "=" => false, + "!=" => true, + _ => false, + }; + } + i += 16; + } + } + } + + // Handle remaining bytes + if i < var_len { + for j in i..var_len { + if var_bytes[j] != filter_bytes[j] { + return match *operator { + "=" => false, + "!=" => true, + _ => false, + }; + } + } + } + + // Strings are equal + match *operator { + "=" => true, + "!=" => false, + _ => false, // Other operators not supported for strings + } + } + } else { + false + } + } + } + FilterExpression::And(left, right) => { + self.evaluate_filter_expression(result, left) + && self.evaluate_filter_expression(result, right) + } + FilterExpression::Or(left, right) => { + self.evaluate_filter_expression(result, left) + || self.evaluate_filter_expression(result, right) + } + FilterExpression::Not(expr) => { + !self.evaluate_filter_expression(result, expr) + } + 
FilterExpression::ArithmeticExpr(expr_str) => { + // True if it's non-zero + match self.evaluate_arithmetic_string(result, expr_str) { + Ok(val) => val != 0.0, + Err(_) => false, + } + } + } + }) + }) + .collect() + } + + // Helper function to evaluate an arithmetic expression + fn evaluate_arithmetic_expression<'a>( + &self, + result: &BTreeMap<&'a str, String>, + expr: &shared::query::ArithmeticExpression<'a>, + ) -> Result { + match expr { + shared::query::ArithmeticExpression::Operand(operand) => { + // Check if it's a variable + if operand.starts_with('?') { + if let Some(var_value) = result.get(*operand) { + // Parse the variable value as a number + var_value + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + } else { + Err(format!("Variable '{}' not found", operand)) + } + } + // Check if it's a numeric literal + else if operand.chars().all(|c| c.is_digit(10) || c == '.') { + operand + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", operand)) + } + // Check if it's a string literal + else if operand.starts_with('"') && operand.ends_with('"') { + Err(format!( + "Cannot perform arithmetic on string literal '{}'", + operand + )) + } + // Parse it as a number + else { + operand + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", operand)) + } + } + shared::query::ArithmeticExpression::Add(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + Ok(left_val + right_val) + } + shared::query::ArithmeticExpression::Subtract(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + Ok(left_val - right_val) + } + shared::query::ArithmeticExpression::Multiply(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = 
self.evaluate_arithmetic_expression(result, right)?; + Ok(left_val * right_val) + } + shared::query::ArithmeticExpression::Divide(left, right) => { + let left_val = self.evaluate_arithmetic_expression(result, left)?; + let right_val = self.evaluate_arithmetic_expression(result, right)?; + if right_val == 0.0 { + Err("Division by zero".to_string()) + } else { + Ok(left_val / right_val) + } + } + } + } + + // Helper function to parse and evaluate an arithmetic expression from a string + fn evaluate_arithmetic_string<'a>( + &self, + result: &BTreeMap<&'a str, String>, + expr_str: &'a str, + ) -> Result { + // Check for parenthesized expressions and remove them if needed + let expr_to_parse = if expr_str.starts_with('(') && expr_str.ends_with(')') { + &expr_str[1..expr_str.len() - 1] + } else { + expr_str + }; + + if expr_to_parse.contains('+') + || expr_to_parse.contains('-') + || expr_to_parse.contains('*') + || expr_to_parse.contains('/') + { + // Parse the expression string into an ArithmeticExpression + match parser::parse_arithmetic_expression(expr_to_parse) { + Ok((_, arithmetic_expr)) => { + // Evaluate the parsed expression + self.evaluate_arithmetic_expression(result, &arithmetic_expr) + } + Err(e) => { + // Print the error + eprintln!( + "Failed to parse arithmetic expression '{}': {:?}", + expr_to_parse, e + ); + + // If parsing fails, try to treat it as a simple operand + if expr_to_parse.starts_with('?') { + // It's a variable + if let Some(var_value) = result.get(expr_to_parse) { + var_value + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + } else { + Err(format!("Variable '{}' not found", expr_to_parse)) + } + } else { + // Parse as a number + expr_to_parse + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + } + } + } + } else { + // No arithmetic operators, treat as simple operand + if expr_to_parse.starts_with('?') { + // It's a variable + if let Some(var_value) = 
result.get(expr_to_parse) { + var_value + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + } else { + Err(format!("Variable '{}' not found", expr_to_parse)) + } + } else { + // Parse as a number + expr_to_parse + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + } + } + } + + // Helper method to evaluate a filter expression against a result + fn evaluate_filter_expression<'a>( + &self, + result: &BTreeMap<&'a str, String>, + filter_expr: &FilterExpression<'a>, + ) -> bool { + match filter_expr { + FilterExpression::Comparison(left, operator, right) => { + // Evaluate both sides as arithmetic expressions + let left_result = self.evaluate_arithmetic_string(result, left); + let right_result = self.evaluate_arithmetic_string(result, right); + + match (left_result, right_result) { + (Ok(left_val), Ok(right_val)) => { + // Both sides are numeric, perform numeric comparison + match *operator { + "=" => left_val == right_val, + "!=" => left_val != right_val, + ">" => left_val > right_val, + ">=" => left_val >= right_val, + "<" => left_val < right_val, + "<=" => left_val <= right_val, + _ => false, + } + } + _ => { + let left_str = if left.starts_with('?') { + // Fix for the type mismatch error - convert to string + match result.get(left) { + Some(val) => val.as_str(), + None => left, + } + } else { + left + }; + + let right_str = if right.starts_with('?') { + // Fix for the type mismatch error - convert to string + match result.get(right) { + Some(val) => val.as_str(), + None => right, + } + } else { + right + }; + + match *operator { + "=" => left_str == right_str, + "!=" => left_str != right_str, + _ => false, // Other operators not supported for strings + } + } + } + } + FilterExpression::And(left, right) => { + self.evaluate_filter_expression(result, left) + && self.evaluate_filter_expression(result, right) + } + FilterExpression::Or(left, right) => { + self.evaluate_filter_expression(result, left) + || 
self.evaluate_filter_expression(result, right) + } + FilterExpression::Not(expr) => !self.evaluate_filter_expression(result, expr), + FilterExpression::ArithmeticExpr(expr_str) => { + // An arithmetic expression by itself is evaluated to true if it's non-zero + match self.evaluate_arithmetic_string(result, expr_str) { + Ok(val) => val != 0.0, + Err(_) => false, + } + } + } + } + + pub fn union(&mut self, other: &SparqlDatabase) -> Self { + // Create a new dictionary by cloning and merging + let self_dict = self.dictionary.read().unwrap(); + let other_dict = other.dictionary.read().unwrap(); + let mut merged_dictionary = self_dict.clone(); + drop(self_dict); + + // Re-encode triples from the other database using the merged dictionary + let mut re_encoded_triples = BTreeSet::new(); + for triple in &other.triples { + let subject = merged_dictionary.encode(other_dict.decode(triple.subject).unwrap()); + let predicate = merged_dictionary.encode(other_dict.decode(triple.predicate).unwrap()); + let object = merged_dictionary.encode(other_dict.decode(triple.object).unwrap()); + re_encoded_triples.insert(Triple { + subject, + predicate, + object, + }); + } + + // Merge the triples and streams + let union_triples: BTreeSet = + self.triples.union(&re_encoded_triples).cloned().collect(); + let mut union_streams = self.streams.clone(); + for ts_triple in &other.streams { + let subject = + merged_dictionary.encode(other_dict.decode(ts_triple.triple.subject).unwrap()); + let predicate = + merged_dictionary.encode(other_dict.decode(ts_triple.triple.predicate).unwrap()); + let object = + merged_dictionary.encode(other_dict.decode(ts_triple.triple.object).unwrap()); + let re_encoded_ts_triple = TimestampedTriple { + triple: Triple { + subject, + predicate, + object, + }, + timestamp: ts_triple.timestamp, + }; + if !union_streams.contains(&re_encoded_ts_triple) { + union_streams.push(re_encoded_ts_triple); + } + } + drop(other_dict); + + Self { + triples: union_triples, + streams: 
union_streams, + sliding_window: self.sliding_window.clone(), + dictionary: Arc::new(RwLock::new(merged_dictionary)), + prefixes: self.prefixes.clone(), + udfs: HashMap::new(), + index_manager: Some(self.index().clone_empty()), + rule_map: HashMap::new(), + cached_stats: None, + index_config: self.index_config.clone(), + } + } + + pub fn par_join(&mut self, other: &SparqlDatabase, predicate: &str) -> Self { + let mut dict = self.dictionary.write().unwrap(); + let predicate_id = dict.encode(predicate); + drop(dict); + let other_map: BTreeMap<&u32, Vec<&Triple>> = other + .triples + .par_iter() + .filter(|other_triple| other_triple.predicate == predicate_id) + .flat_map(|other_triple| { + vec![ + (&other_triple.subject, other_triple), + (&other_triple.object, other_triple), + ] + }) + .fold( + || BTreeMap::new(), + |mut acc, (key, triple)| { + acc.entry(key).or_insert_with(Vec::new).push(triple); + acc + }, + ) + .reduce( + || BTreeMap::new(), + |mut acc, map| { + for (key, triples) in map { + acc.entry(key).or_insert_with(Vec::new).extend(triples); + } + acc + }, + ); + + let joined_triples: BTreeSet = self + .triples + .par_iter() + .filter(|triple| triple.predicate == predicate_id) + .fold( + || BTreeSet::new(), + |mut local_set, triple| { + if let Some(matching_triples) = other_map.get(&triple.object) { + for other_triple in matching_triples { + local_set.insert(Triple { + subject: triple.subject, + predicate: other_triple.predicate, + object: other_triple.object, + }); + } + } + local_set + }, + ) + .reduce( + || BTreeSet::new(), + |mut set1, set2| { + set1.extend(set2); + set1 + }, + ); + + Self { + triples: joined_triples, + streams: self.streams.clone(), + sliding_window: self.sliding_window.clone(), + dictionary: Arc::clone(&self.dictionary), + prefixes: self.prefixes.clone(), + udfs: HashMap::new(), + index_manager: Some(self.index().clone_empty()), + rule_map: HashMap::new(), + cached_stats: None, + index_config: self.index_config.clone(), + } + } + + pub 
fn perform_join<'a>( + &self, + subject_var: &'a str, + predicate: &'a str, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + ) -> Vec> { + let mut new_results = Vec::new(); + + for triple in triples { + let subject = dictionary.decode(triple.subject).unwrap(); + let pred = dictionary.decode(triple.predicate).unwrap(); + let object = dictionary.decode(triple.object).unwrap(); + + if pred == predicate { + for result in &final_results { + let mut extended_result = result.clone(); + let mut valid_extension = true; + + // Check and extend the result with the subject + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + valid_extension = false; + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + + // Check and extend the result with the object + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + valid_extension = false; + } + } else { + extended_result.insert(object_var, object.to_string()); + } + + if valid_extension { + new_results.push(extended_result); + } + } + } + } + + new_results + } + + pub fn perform_join_par_simd_with_strict_filter_1<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &Arc>, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let dictionary = dictionary.read().unwrap(); + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let 
subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + // Pre-allocate output vector + let results = Mutex::new(Vec::new()); + + // Using Rayon for parallel processing + triples.par_chunks(256).for_each(|chunk| { + let mut local_results = Vec::new(); + + for triple in chunk { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // SIMD predicate comparison + if pred.as_bytes() != predicate_bytes { + continue; + } + + // SIMD literal filter comparison + if let Some(filter_bytes) = literal_filter_bytes { + if object.as_bytes() != filter_bytes { + continue; + } + } + + // Process group both_vars_bound + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + let extended_result = result.clone(); + local_results.push(extended_result); + } + } + } + + // Process group subject_var_bound + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group 
object_var_bound + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group neither_var_bound + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Push local results to the shared results vector + let mut global_results = results.lock().unwrap(); + global_results.extend(local_results); + }); + + results.into_inner().unwrap() + } + + pub fn perform_join_par_simd_with_strict_filter_2<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings. 
+ let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + // Pre-allocate output vector. + let results = Mutex::new(Vec::new()); + + // Using Rayon for parallel processing. + triples.par_chunks(256).for_each(|chunk| { + let mut local_results = Vec::new(); + + for triple in chunk { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // SIMD predicate comparison using simd_eq. + if !unsafe { simd_eq(pred.as_bytes(), predicate_bytes) } { + continue; + } + + // SIMD literal filter comparison. + if let Some(filter_bytes) = literal_filter_bytes { + if !unsafe { simd_eq(object.as_bytes(), filter_bytes) } { + continue; + } + } + + // Process group both_vars_bound. + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + local_results.push(result.clone()); + } + } + } + + // Process group subject_var_bound. + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend object_var. 
+ if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group object_var_bound. + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend subject_var. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Process group neither_var_bound. + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend subject_var. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend object_var. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + local_results.push(extended_result); + } + } + } + + // Push local results to the shared results vector. 
+ let mut global_results = results.lock().unwrap(); + global_results.extend(local_results); + }); + + results.into_inner().unwrap() + } + + pub fn perform_join_sequential<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings. + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + let mut results = Vec::new(); + + // Process triples sequentially. + for triple in triples { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // Check if the predicate matches. + if pred.as_bytes() != predicate_bytes { + continue; + } + + // Check the literal filter if provided. 
+ if let Some(filter_bytes) = literal_filter_bytes { + if object.as_bytes() != filter_bytes { + continue; + } + } + + // Process group where both variables are already bound. + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + results.push(result.clone()); + } + } + } + + // Process group where only subject_var is bound. + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group where only object_var is bound. + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group where neither variable is bound. + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. 
+ } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } + + results + } + + pub fn perform_join_sequential_simd<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); + } + + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Partition final_results into groups based on variable bindings. + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } + } + } + + let mut results = Vec::new(); + + // Process triples sequentially. + for triple in triples { + if let (Some(subject), Some(pred), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.predicate), + dictionary.decode(triple.object), + ) { + // Use SIMD-based comparison for the predicate. + if !simd_bytes_eq(pred.as_bytes(), predicate_bytes) { + continue; + } + + // Use SIMD-based comparison for the literal filter if provided. 
+ if let Some(filter_bytes) = literal_filter_bytes { + if !simd_bytes_eq(object.as_bytes(), filter_bytes) { + continue; + } + } + + // Process group where both variables are already bound. + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + results.push(result.clone()); + } + } + } + + // Process group where only subject_var is bound. + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group where only object_var is bound. + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group where neither variable is bound. + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend the subject_var binding. + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding. + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend the object_var binding. + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding. 
+ } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } + + results + } + + pub fn perform_join_par_simd_with_strict_filter_3<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + // Early return for empty joins + if final_results.is_empty() { + return Vec::new(); + } + + // Pre-fetch predicate and filter bytes to avoid string comparisons + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + // Preallocate with capacity estimation to avoid rehashing + let estimated_capacity = (final_results.len() / 4).max(HASHMAP_INITIAL_CAPACITY); + + // Use with_capacity to preallocate hashmap space + let mut both_vars_bound: HashMap<(String, String), Vec> = + HashMap::with_capacity(estimated_capacity); + let mut subject_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut object_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); + + // Pre-compute and classify bindings - this is serial but much faster than doing it in parallel + for (idx, result) in final_results.iter().enumerate() { + let subject_binding = result.get(subject_var); + let object_binding = result.get(object_var); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_insert_with(|| Vec::with_capacity(4)) + .push(idx); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, None) => { + 
neither_var_bound.push(idx); + } + } + } + + // Immutable shared references for threading + let final_results_arc = Arc::new(final_results); + let both_vars_bound_arc = Arc::new(both_vars_bound); + let subject_var_bound_arc = Arc::new(subject_var_bound); + let object_var_bound_arc = Arc::new(object_var_bound); + let neither_var_bound_arc = Arc::new(neither_var_bound); + + // Calculate optimal chunk size based on available processors and dataset size + let chunk_size = (triples.len() / rayon::current_num_threads()).max(MIN_CHUNK_SIZE); + + // Process triples in chunks for better cache locality and load balancing + let results = triples + .par_chunks(chunk_size) + .flat_map(|triple_chunk| { + // Preallocate result vector for this chunk based on estimated hit rate + let mut local_results = Vec::with_capacity(triple_chunk.len() / 4); + + // Process each triple in the chunk + for triple in triple_chunk { + // Step 1: Quick predicate check first (early filter) + let pred_opt = dictionary.decode(triple.predicate); + if pred_opt.is_none() + || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes + { + continue; + } + + // Step 2: Filter check if needed + if let Some(filter_bytes) = &literal_filter_bytes { + let obj_opt = dictionary.decode(triple.object); + if obj_opt.is_none() + || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes + { + continue; + } + + // Decode subject only if predicate and object pass filters + if let Some(subj) = dictionary.decode(triple.subject) { + process_join( + &subj, + obj_opt.unwrap(), + subject_var, + object_var, + &both_vars_bound_arc, + &subject_var_bound_arc, + &object_var_bound_arc, + &neither_var_bound_arc, + &final_results_arc, + &mut local_results, + ); + } + } else { + // No filter - decode both subject and object + let subj_opt = dictionary.decode(triple.subject); + let obj_opt = dictionary.decode(triple.object); + + if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { + process_join( + &subj, + &obj, + subject_var, + 
object_var, + &both_vars_bound_arc, + &subject_var_bound_arc, + &object_var_bound_arc, + &neither_var_bound_arc, + &final_results_arc, + &mut local_results, + ); + } + } + } + + local_results + }) + .collect(); + + results + } + + pub fn perform_join_par_simd_with_strict_filter_4<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + // Early return for empty joins + if final_results.is_empty() { + return Vec::new(); + } + + // Pre-fetch predicate and filter bytes to avoid string comparisons + let predicate_bytes = predicate.as_bytes(); + let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + + let estimated_capacity = (final_results.len() / 3).max(HASHMAP_INITIAL_CAPACITY1); + + let mut both_vars_bound: HashMap<(String, String), Vec> = + HashMap::with_capacity(estimated_capacity / 2); // This tends to be smaller + let mut subject_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut object_var_bound: HashMap> = + HashMap::with_capacity(estimated_capacity); + let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); + + // Pre-compute and classify bindings - this is serial but much faster than doing it in parallel + for (idx, result) in final_results.iter().enumerate() { + let subject_binding = result.get(subject_var); + let object_binding = result.get(object_var); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_insert_with(|| Vec::with_capacity(4)) + .push(idx); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_insert_with(|| Vec::with_capacity(8)) + .push(idx); + } + (None, None) => { + 
neither_var_bound.push(idx); + } + } + } + + // Immutable shared references for threading + let final_results_arc = Arc::new(final_results); + let both_vars_bound_arc = Arc::new(both_vars_bound); + let subject_var_bound_arc = Arc::new(subject_var_bound); + let object_var_bound_arc = Arc::new(object_var_bound); + let neither_var_bound_arc = Arc::new(neither_var_bound); + + let chunk_size = + ((triples.len() / rayon::current_num_threads()) * 3 / 2).max(MIN_CHUNK_SIZE1); + + let results = triples + .par_chunks(chunk_size) + .fold( + || Vec::with_capacity(chunk_size / 4), // Local vector capacity based on chunk size + |mut local_results, triple_chunk| { + // Create a local result buffer + process_triple_chunk( + triple_chunk, + predicate_bytes, + &literal_filter_bytes, + subject_var, + object_var, + &both_vars_bound_arc, + &subject_var_bound_arc, + &object_var_bound_arc, + &neither_var_bound_arc, + &final_results_arc, + &mut local_results, + dictionary, + ); + + local_results + }, + ) + .reduce( + || Vec::new(), + |mut acc, mut chunk| { + if acc.is_empty() { + return chunk; + } + if chunk.is_empty() { + return acc; + } + + // Pre-allocate to avoid reallocation during append + if acc.capacity() < acc.len() + chunk.len() { + acc.reserve(chunk.len()); + } + acc.append(&mut chunk); + acc + }, + ); + + results + } + + pub fn istream(&self, last_timestamp: u64) -> Vec { + let mut new_triples = vec![]; + for ts_triple in &self.streams { + if ts_triple.timestamp > last_timestamp { + new_triples.push(ts_triple.triple.clone()); + } + } + new_triples + } + + pub fn dstream(&self, last_timestamp: u64, current_timestamp: u64) -> Vec { + let mut old_triples = BTreeSet::new(); + let mut current_triples = BTreeSet::new(); + + for ts_triple in &self.streams { + if ts_triple.timestamp <= last_timestamp { + old_triples.insert(ts_triple.triple.clone()); + } + if ts_triple.timestamp <= current_timestamp { + current_triples.insert(ts_triple.triple.clone()); + } + } + + 
old_triples.difference(&current_triples).cloned().collect() + } + + pub fn rstream(&self, start: u64, end: u64) -> Vec { + let mut current_triples = BTreeSet::new(); + + for ts_triple in &self.streams { + if ts_triple.timestamp >= start && ts_triple.timestamp <= end { + current_triples.insert(ts_triple.triple.clone()); + } + } + + current_triples.into_iter().collect() + } + + pub fn set_sliding_window(&mut self, width: u64, slide: u64) { + self.sliding_window = Some(SlidingWindow::new(width, slide)); + } + + pub fn evaluate_sliding_window(&mut self) -> Vec { + if let Some(window) = &self.sliding_window { + let current_time = current_timestamp(); + let start_time = if current_time > window.width { + current_time - window.width + } else { + 0 + }; + + let result = self.rstream(start_time, current_time); + + // Update last evaluated time + self.sliding_window.as_mut().unwrap().last_evaluated = current_time; + + result + } else { + Vec::new() + } + } - // Check and extend the result with the subject - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - valid_extension = false; + pub fn window_close_policy(&mut self) -> Vec { + let mut result = vec![]; + if let Some(window) = &self.sliding_window { + let current_time = current_timestamp(); + if current_time >= window.last_evaluated + window.slide { + result = self.evaluate_sliding_window(); } - } else { - extended_result.insert(subject_var, subject.to_string()); - } + } + result + } - // Check and extend the result with the object - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - valid_extension = false; + pub fn content_change_policy(&mut self) -> Vec { + let mut _result = vec![]; + let initial_state: BTreeSet<_> = self.triples.clone(); + if let Some(_window) = &self.sliding_window { + _result = self.evaluate_sliding_window(); + let current_state: BTreeSet<_> = self.triples.clone(); + if initial_state != current_state { 
+ return _result; } - } else { - extended_result.insert(object_var, object.to_string()); - } + } + vec![] + } - if valid_extension { - new_results.push(extended_result); - } + pub fn non_empty_content_policy(&mut self) -> Vec { + let result = self.evaluate_sliding_window(); + if !result.is_empty() { + return result; } - } + vec![] } - new_results - } + pub fn periodic_policy(&mut self, interval: std::time::Duration) -> Vec { + let mut result = vec![]; + if let Some(window) = &self.sliding_window { + let current_time = current_timestamp(); + if current_time >= window.last_evaluated + interval.as_secs() { + result = self.evaluate_sliding_window(); + } + } + result + } - pub fn perform_join_par_simd_with_strict_filter_1<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &Arc>, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let dictionary = dictionary.read().unwrap(); - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - 
.push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - // Pre-allocate output vector - let results = Mutex::new(Vec::new()); - - // Using Rayon for parallel processing - triples.par_chunks(256).for_each(|chunk| { - let mut local_results = Vec::new(); - - for triple in chunk { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // SIMD predicate comparison - if pred.as_bytes() != predicate_bytes { - continue; - } + pub fn auto_policy_evaluation(&mut self) -> Vec { + let current_time = current_timestamp(); + let mut result = vec![]; - // SIMD literal filter comparison - if let Some(filter_bytes) = literal_filter_bytes { - if object.as_bytes() != filter_bytes { - continue; + if let Some(window) = &self.sliding_window { + if current_time >= window.last_evaluated + window.slide { + println!("Window Close Policy"); + result.extend(self.evaluate_sliding_window()); } - } + } - // Process group both_vars_bound - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - let extended_result = result.clone(); - local_results.push(extended_result); - } - } - } - - // Process group subject_var_bound - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); - } - local_results.push(extended_result); - } - } - } - - // Process group object_var_bound - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend subject_var - if let 
Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - local_results.push(extended_result); - } - } - } - - // Process group neither_var_bound - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); + let initial_state: BTreeSet<_> = self.triples.clone(); + if let Some(_window) = &self.sliding_window { + let current_state: BTreeSet<_> = self.triples.clone(); + if initial_state != current_state { + println!("Content Change Policy"); + result.extend(self.evaluate_sliding_window()); } - local_results.push(extended_result); - } } - } - // Push local results to the shared results vector - let mut global_results = results.lock().unwrap(); - global_results.extend(local_results); - }); + let non_empty_result = self.evaluate_sliding_window(); + if !non_empty_result.is_empty() { + println!("Non-empty Content Policy"); + result.extend(non_empty_result); + } - results.into_inner().unwrap() - } + let interval = std::time::Duration::new(5, 0); + if let Some(window) = &self.sliding_window { + if current_time >= window.last_evaluated + interval.as_secs() { + println!("Periodic Policy"); + result.extend(self.evaluate_sliding_window()); + } + } - pub fn perform_join_par_simd_with_strict_filter_2<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a 
Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); + result } - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); + pub fn handle_query(&mut self, query: &str) -> String { + // Assume the query string is in a basic format like "subject predicate object" + let parts: Vec<&str> = query.split_whitespace().collect(); - // Partition final_results into groups based on variable bindings. - let mut both_vars_bound: HashMap<(String, String), Vec>> = HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); + if parts.len() != 3 { + return "Invalid query format. Expected 'subject predicate object'.".to_string(); + } - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); + let subject = parts[0]; + let predicate = parts[1]; + let object = parts[2]; - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound.entry(subj_val.clone()).or_default().push(result); - } - (None, Some(obj_val)) => { - object_var_bound.entry(obj_val.clone()).or_default().push(result); + let mut dict = self.dictionary.write().unwrap(); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); + + let mut result = String::new(); + for triple in &self.triples { + if triple.subject == subject_id + && triple.predicate == predicate_id + && triple.object == object_id + { + result.push_str(&format!( + "Subject: {}, Predicate: {}, Object: {}\n", + dict.decode(triple.subject).unwrap(), + dict.decode(triple.predicate).unwrap(), 
+ dict.decode(triple.object).unwrap() + )); + } } - (None, None) => { - neither_var_bound.push(result); + drop(dict); + + if result.is_empty() { + result = "No matching triples found.".to_string(); } - } + + result } - // Pre-allocate output vector. - let results = Mutex::new(Vec::new()); + pub fn handle_update(&mut self, update: &str) -> String { + // Parse the SPARQL update and apply changes to the database + if update.starts_with("INSERT") { + // Extract the part between curly braces + if let Some(start) = update.find('{') { + if let Some(end) = update.find('}') { + let triple_str = &update[start + 1..end].trim(); + let parts: Vec<&str> = triple_str.split_whitespace().collect(); + + if parts.len() == 3 { + let subject = parts[0].to_string(); + let predicate = parts[1].to_string(); + let object = parts[2].to_string(); + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + self.triples.insert(triple); + return "Update Successful".to_string(); + } + } + } + } else if update.starts_with("DELETE") { + // Extract the part between curly braces + if let Some(start) = update.find('{') { + if let Some(end) = update.find('}') { + let triple_str = &update[start + 1..end].trim(); + let parts: Vec<&str> = triple_str.split_whitespace().collect(); + + if parts.len() == 3 { + let subject = parts[0].to_string(); + let predicate = parts[1].to_string(); + let object = parts[2].to_string(); + + let mut dict = self.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + self.triples.remove(&triple); + return "Update Successful".to_string(); + } + } + } + } + "Update Failed".to_string() + } - // Using Rayon for parallel processing. 
- triples.par_chunks(256).for_each(|chunk| { - let mut local_results = Vec::new(); + pub fn handle_http_request(&mut self, request: &str) -> String { + let mut headers = [httparse::EMPTY_HEADER; 16]; + let mut req = httparse::Request::new(&mut headers); + req.parse(request.as_bytes()).unwrap(); - for triple in chunk { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // SIMD predicate comparison using simd_eq. - if !unsafe { simd_eq(pred.as_bytes(), predicate_bytes) } { - continue; - } - - // SIMD literal filter comparison. - if let Some(filter_bytes) = literal_filter_bytes { - if !unsafe { simd_eq(object.as_bytes(), filter_bytes) } { - continue; - } - } - - // Process group both_vars_bound. - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - local_results.push(result.clone()); - } - } - } - - // Process group subject_var_bound. - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend object_var. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); + match req.method.unwrap() { + "GET" => { + let url = Url::parse(&("http://localhost".to_owned() + req.path.unwrap())).unwrap(); + let query_pairs: HashMap<_, _> = url.query_pairs().into_owned().collect(); + if let Some(query) = query_pairs.get("query") { + return self.handle_query(query); } - local_results.push(extended_result); - } - } - } - - // Process group object_var_bound. - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend subject_var. 
- if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - local_results.push(extended_result); - } - } - } - - // Process group neither_var_bound. - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend subject_var. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); } - // Extend object_var. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); + "POST" => { + let content_type = req + .headers + .iter() + .find(|header| header.name.eq_ignore_ascii_case("Content-Type")) + .map(|header| header.value); + + if let Some(content_type) = content_type { + if content_type == b"application/sparql-query" { + // Direct POST query + if let Some(body) = request.split("\r\n\r\n").nth(1) { + return self.handle_query(body); + } + } else if content_type == b"application/x-www-form-urlencoded" { + // URL-encoded POST query or update + if let Some(body) = request.split("\r\n\r\n").nth(1) { + let body_decoded = + percent_decode(body.as_bytes()).decode_utf8().unwrap(); + let params: HashMap<_, _> = body_decoded + .split('&') + .map(|pair| { + let mut split = pair.split('='); + ( + split.next().unwrap().to_string(), + split.next().unwrap_or("").to_string(), + ) + }) + .collect(); + + if let Some(query) = params.get("query") { + return self.handle_query(query); + } else if let Some(update) = params.get("update") { + return self.handle_update(update); + } + } + } else if content_type == b"application/sparql-update" { + // Direct POST update + if 
let Some(body) = request.split("\r\n\r\n").nth(1) { + return self.handle_update(body); + } + } + } } - local_results.push(extended_result); - } + _ => {} } - } - // Push local results to the shared results vector. - let mut global_results = results.lock().unwrap(); - global_results.extend(local_results); - }); + "Bad Request".to_string() + } - results.into_inner().unwrap() - } + pub fn debug_print_triples(&self) { + let dict = self.dictionary.read().unwrap(); + for triple in &self.triples { + println!( + "Stored Triple -> Subject: {}, Predicate: {}, Object: {}", + dict.decode(triple.subject).unwrap(), + dict.decode(triple.predicate).unwrap(), + dict.decode(triple.object).unwrap() + ); + } + } - pub fn perform_join_sequential<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings. 
- let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - let mut results = Vec::new(); - - // Process triples sequentially. - for triple in triples { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // Check if the predicate matches. - if pred.as_bytes() != predicate_bytes { - continue; - } - - // Check the literal filter if provided. - if let Some(filter_bytes) = literal_filter_bytes { - if object.as_bytes() != filter_bytes { - continue; - } + #[cfg(feature = "cuda")] + pub fn perform_hash_join_cuda_wrapper<'a>( + &self, + subject_var: &'a str, + predicate: String, + object_var: &'a str, + triples: Vec, + dictionary: &'a Dictionary, + final_results: Vec>, + literal_filter: Option, + ) -> Vec> { + if final_results.is_empty() { + return Vec::new(); } - // Process group where both variables are already bound. 
- { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - results.push(result.clone()); + // Prepare data for CUDA + let subjects: Vec = triples.iter().map(|t| t.subject).collect(); + let predicates: Vec = triples.iter().map(|t| t.predicate).collect(); + let objects: Vec = triples.iter().map(|t| t.object).collect(); + + let predicate_filter = dictionary.clone().encode(&predicate); + + let literal_filter_value = literal_filter + .as_ref() + .map(|lit| dictionary.clone().encode(lit)) + .unwrap_or(0); + + let literal_filter_option = if literal_filter.is_some() { + Some(literal_filter_value) + } else { + None + }; + + // Call CUDA function + let matching_indices = hash_join_cuda( + &subjects, + &predicates, + &objects, + predicate_filter, + literal_filter_option, + ); + + // Prepare variable bindings + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); + let mut subject_var_bound: HashMap>> = HashMap::new(); + let mut object_var_bound: HashMap>> = HashMap::new(); + let mut neither_var_bound: Vec> = Vec::new(); + + for result in final_results { + let subject_binding = result.get(subject_var).cloned(); + let object_binding = result.get(object_var).cloned(); + + match (subject_binding, object_binding) { + (Some(subj_val), Some(obj_val)) => { + both_vars_bound + .entry((subj_val.clone(), obj_val.clone())) + .or_default() + .push(result); + } + (Some(subj_val), None) => { + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); + } + (None, Some(obj_val)) => { + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); + } + (None, None) => { + neither_var_bound.push(result); + } } - } } - // Process group where only subject_var is bound. - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the object_var binding. 
- if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); + // Reconstruct results + let mut results = Vec::new(); + + for idx in matching_indices { + let triple = &triples[idx as usize]; + + if let (Some(subject), Some(object)) = ( + dictionary.decode(triple.subject), + dictionary.decode(triple.object), + ) { + // Process group both_vars_bound + { + let key = (subject.to_string(), object.to_string()); + if let Some(results_vec) = both_vars_bound.get(&key) { + for result in results_vec { + let extended_result = result.clone(); + results.push(extended_result); + } + } + } + + // Process group subject_var_bound + { + if let Some(results_vec) = subject_var_bound.get(subject) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group object_var_bound + { + if let Some(results_vec) = object_var_bound.get(object) { + for result in results_vec { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + results.push(extended_result); + } + } + } + + // Process group neither_var_bound + for result in &neither_var_bound { + let mut extended_result = result.clone(); + // Extend subject_var + if let Some(existing_subject) = extended_result.get(subject_var) { + if existing_subject != &subject { + continue; // Inconsistent 
variable binding + } + } else { + extended_result.insert(subject_var, subject.to_string()); + } + // Extend object_var + if let Some(existing_object) = extended_result.get(object_var) { + if existing_object != &object { + continue; // Inconsistent variable binding + } + } else { + extended_result.insert(object_var, object.to_string()); + } + results.push(extended_result); + } } - } } - // Process group where only object_var is bound. - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - results.push(extended_result); + results + } + + // Create user defined function + pub fn register_udf(&mut self, name: &str, f: F) + where + F: Fn(Vec<&str>) -> String + Send + Sync + 'static, + { + self.udfs.insert(name.to_string(), ClonableFn::new(f)); + } + + /// Triple to string + pub fn triple_to_string(&self, triple: &Triple, dict: &Dictionary) -> String { + let subject = dict.decode(triple.subject); + let predicate = dict.decode(triple.predicate); + let object = dict.decode(triple.object); + format!( + "{} {} {}", + subject.unwrap(), + predicate.unwrap(), + object.unwrap() + ) + } + + pub fn decode_triple(&self, triple: &Triple) -> Option<(String, String, String)> { + let dict = self.dictionary.read().unwrap(); + let subject = dict.decode(triple.subject)?.to_string(); + let predicate = dict.decode(triple.predicate)?.to_string(); + let object = dict.decode(triple.object)?.to_string(); + drop(dict); + + Some((subject, predicate, object)) + } +} + +#[cfg_attr( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature(enable = "sse2") +)] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))] +pub unsafe 
fn simd_eq(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + + // SSE2 implementation for x86/x86_64 + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + let len = a.len(); + let chunks = len / 16; + let mut i = 0; + while i < chunks * 16 { + let pa = a.as_ptr().add(i) as *const __m128i; + let pb = b.as_ptr().add(i) as *const __m128i; + let va = _mm_loadu_si128(pa); + let vb = _mm_loadu_si128(pb); + let cmp = _mm_cmpeq_epi8(va, vb); + let mask = _mm_movemask_epi8(cmp); + if mask != 0xFFFF { + return false; + } + i += 16; + } + // Compare any remaining bytes + for j in (chunks * 16)..len { + if a[j] != b[j] { + return false; } - } } + return true; + } - // Process group where neither variable is bound. - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. + // NEON implementation for aarch64 + #[cfg(target_arch = "aarch64")] + { + let len = a.len(); + let chunks = len / 16; + let mut i = 0; + while i < chunks * 16 { + let pa = a.as_ptr().add(i); + let pb = b.as_ptr().add(i); + let va = vld1q_u8(pa); + let vb = vld1q_u8(pb); + let cmp = vceqq_u8(va, vb); + let cmp_u64 = vreinterpretq_u64_u8(cmp); + let low = vgetq_lane_u64(cmp_u64, 0); + let high = vgetq_lane_u64(cmp_u64, 1); + if low != u64::MAX || high != u64::MAX { + return false; } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend the object_var binding. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. 
+ i += 16; + } + // Compare any remaining bytes + for j in (chunks * 16)..len { + if a[j] != b[j] { + return false; } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); } - } + return true; } - results - } + // Fallback for other architectures + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] + { + return a == b; + } +} - pub fn perform_join_sequential_simd<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Partition final_results into groups based on variable bindings. - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - let mut results = Vec::new(); - - // Process triples sequentially. 
- for triple in triples { - if let (Some(subject), Some(pred), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.predicate), - dictionary.decode(triple.object), - ) { - // Use SIMD-based comparison for the predicate. - if !simd_bytes_eq(pred.as_bytes(), predicate_bytes) { - continue; - } - - // Use SIMD-based comparison for the literal filter if provided. - if let Some(filter_bytes) = literal_filter_bytes { - if !simd_bytes_eq(object.as_bytes(), filter_bytes) { - continue; - } +#[inline] +fn simd_bytes_eq(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + use std::arch::x86_64::*; + let mut i = 0; + let len = a.len(); + while i + 16 <= len { + let a_chunk = _mm_loadu_si128(a.as_ptr().add(i) as *const __m128i); + let b_chunk = _mm_loadu_si128(b.as_ptr().add(i) as *const __m128i); + let cmp = _mm_cmpeq_epi8(a_chunk, b_chunk); + // If all 16 bytes match, _mm_movemask_epi8 returns 0xFFFF. + if _mm_movemask_epi8(cmp) != 0xFFFF { + return false; + } + i += 16; } - - // Process group where both variables are already bound. - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - results.push(result.clone()); + // Compare any remaining bytes. + for j in i..len { + if a[j] != b[j] { + return false; } - } } + true + } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + // Fallback on non-x86 architectures. + a == b + } +} - // Process group where only subject_var is bound. - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the object_var binding. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. 
- } - } else { +#[inline(always)] +fn process_join<'a>( + subject: &str, + object: &str, + subject_var: &'a str, + object_var: &'a str, + both_vars_bound: &Arc>>, + subject_var_bound: &Arc>>, + object_var_bound: &Arc>>, + neither_var_bound: &Arc>, + final_results_arc: &Arc>>, + local_results: &mut Vec>, +) { + // Check both_vars_bound - most restrictive case first + if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { + for &idx in result_indices { + local_results.push(final_results_arc[idx].clone()); + } + } + + // Process subject_var_bound + if let Some(result_indices) = subject_var_bound.get(subject) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + // Check for object consistency if it exists + if let Some(existing_object) = base_result.get(object_var) { + if existing_object == object { + local_results.push(base_result.clone()); + } + } else { + // Bind the object variable + let mut extended_result = base_result.clone(); extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); + local_results.push(extended_result); } - } } + } - // Process group where only object_var is bound. - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. 
- } - } else { + // Process object_var_bound + if let Some(result_indices) = object_var_bound.get(object) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + // Check for subject consistency if it exists + if let Some(existing_subject) = base_result.get(subject_var) { + if existing_subject == subject { + local_results.push(base_result.clone()); + } + } else { + // Bind the subject variable + let mut extended_result = base_result.clone(); extended_result.insert(subject_var, subject.to_string()); - } - results.push(extended_result); + local_results.push(extended_result); } - } } + } + + // Process neither_var_bound - least restrictive case last + for &idx in neither_var_bound.iter() { + let base_result = &final_results_arc[idx]; + + // Check both consistency constraints + let subject_consistent = base_result + .get(subject_var) + .map_or(true, |existing| existing == subject); + let object_consistent = base_result + .get(object_var) + .map_or(true, |existing| existing == object); + + if subject_consistent && object_consistent { + let mut extended_result = base_result.clone(); - // Process group where neither variable is bound. - for result in &neither_var_bound { - let mut extended_result = result.clone(); - // Extend the subject_var binding. - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding. + // Only insert if not already present + if !base_result.contains_key(subject_var) { + extended_result.insert(subject_var, subject.to_string()); } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend the object_var binding. - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding. 
+ if !base_result.contains_key(object_var) { + extended_result.insert(object_var, object.to_string()); } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); + + local_results.push(extended_result); } - } } +} - results - } - - pub fn perform_join_par_simd_with_strict_filter_3<'a>( - &self, +#[inline(always)] +fn process_triple_chunk<'a>( + triple_chunk: &[Triple], + predicate_bytes: &[u8], + literal_filter_bytes: &Option<&[u8]>, subject_var: &'a str, - predicate: String, object_var: &'a str, - triples: Vec, + both_vars_bound: &Arc>>, + subject_var_bound: &Arc>>, + object_var_bound: &Arc>>, + neither_var_bound: &Arc>, + final_results_arc: &Arc>>, + local_results: &mut Vec>, dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - // Early return for empty joins - if final_results.is_empty() { - return Vec::new(); - } - - // Pre-fetch predicate and filter bytes to avoid string comparisons - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - // Preallocate with capacity estimation to avoid rehashing - let estimated_capacity = (final_results.len() / 4).max(HASHMAP_INITIAL_CAPACITY); - - // Use with_capacity to preallocate hashmap space - let mut both_vars_bound: HashMap<(String, String), Vec> = - HashMap::with_capacity(estimated_capacity); - let mut subject_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut object_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); - - // Pre-compute and classify bindings - this is serial but much faster than doing it in parallel - for (idx, result) in final_results.iter().enumerate() { - let subject_binding = result.get(subject_var); - let object_binding = result.get(object_var); - - match (subject_binding, object_binding) { - (Some(subj_val), 
Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_insert_with(|| Vec::with_capacity(4)) - .push(idx); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, None) => { - neither_var_bound.push(idx); - } - } - } - - // Immutable shared references for threading - let final_results_arc = Arc::new(final_results); - let both_vars_bound_arc = Arc::new(both_vars_bound); - let subject_var_bound_arc = Arc::new(subject_var_bound); - let object_var_bound_arc = Arc::new(object_var_bound); - let neither_var_bound_arc = Arc::new(neither_var_bound); - - // Calculate optimal chunk size based on available processors and dataset size - let chunk_size = (triples.len() / rayon::current_num_threads()).max(MIN_CHUNK_SIZE); - - // Process triples in chunks for better cache locality and load balancing - let results = triples - .par_chunks(chunk_size) - .flat_map(|triple_chunk| { - // Preallocate result vector for this chunk based on estimated hit rate - let mut local_results = Vec::with_capacity(triple_chunk.len() / 4); - - // Process each triple in the chunk - for triple in triple_chunk { - // Step 1: Quick predicate check first (early filter) - let pred_opt = dictionary.decode(triple.predicate); - if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { +) { + // Pre-filter triples to avoid unnecessary decoding + for triple in triple_chunk { + let pred_opt = dictionary.decode(triple.predicate); + if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { continue; - } + } - // Step 2: Filter check if needed - if let Some(filter_bytes) = &literal_filter_bytes { + if let Some(filter_bytes) = literal_filter_bytes { let obj_opt = dictionary.decode(triple.object); if 
obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { - continue; + continue; } - // Decode subject only if predicate and object pass filters if let Some(subj) = dictionary.decode(triple.subject) { - process_join( - &subj, - obj_opt.unwrap(), - subject_var, - object_var, - &both_vars_bound_arc, - &subject_var_bound_arc, - &object_var_bound_arc, - &neither_var_bound_arc, - &final_results_arc, - &mut local_results, - ); - } - } else { - // No filter - decode both subject and object + process_join_efficiently( + &subj, + obj_opt.unwrap(), + subject_var, + object_var, + both_vars_bound, + subject_var_bound, + object_var_bound, + neither_var_bound, + final_results_arc, + local_results, + ); + } + } else { let subj_opt = dictionary.decode(triple.subject); let obj_opt = dictionary.decode(triple.object); if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { - process_join( - &subj, - &obj, - subject_var, - object_var, - &both_vars_bound_arc, - &subject_var_bound_arc, - &object_var_bound_arc, - &neither_var_bound_arc, - &final_results_arc, - &mut local_results, - ); - } - } - } - - local_results - }) - .collect(); - - results - } - - pub fn perform_join_par_simd_with_strict_filter_4<'a>( - &self, - subject_var: &'a str, - predicate: String, - object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - // Early return for empty joins - if final_results.is_empty() { - return Vec::new(); - } - - // Pre-fetch predicate and filter bytes to avoid string comparisons - let predicate_bytes = predicate.as_bytes(); - let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); - - let estimated_capacity = (final_results.len() / 3).max(HASHMAP_INITIAL_CAPACITY1); - - let mut both_vars_bound: HashMap<(String, String), Vec> = - HashMap::with_capacity(estimated_capacity / 2); // This tends to be smaller - let mut subject_var_bound: HashMap> = - 
HashMap::with_capacity(estimated_capacity); - let mut object_var_bound: HashMap> = - HashMap::with_capacity(estimated_capacity); - let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); - - // Pre-compute and classify bindings - this is serial but much faster than doing it in parallel - for (idx, result) in final_results.iter().enumerate() { - let subject_binding = result.get(subject_var); - let object_binding = result.get(object_var); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_insert_with(|| Vec::with_capacity(4)) - .push(idx); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_insert_with(|| Vec::with_capacity(8)) - .push(idx); - } - (None, None) => { - neither_var_bound.push(idx); - } - } - } - - // Immutable shared references for threading - let final_results_arc = Arc::new(final_results); - let both_vars_bound_arc = Arc::new(both_vars_bound); - let subject_var_bound_arc = Arc::new(subject_var_bound); - let object_var_bound_arc = Arc::new(object_var_bound); - let neither_var_bound_arc = Arc::new(neither_var_bound); - - let chunk_size = ((triples.len() / rayon::current_num_threads()) * 3 / 2).max(MIN_CHUNK_SIZE1); - - let results = triples - .par_chunks(chunk_size) - .fold( - || Vec::with_capacity(chunk_size / 4), // Local vector capacity based on chunk size - |mut local_results, triple_chunk| { - // Create a local result buffer - process_triple_chunk( - triple_chunk, - predicate_bytes, - &literal_filter_bytes, - subject_var, - object_var, - &both_vars_bound_arc, - &subject_var_bound_arc, - &object_var_bound_arc, - &neither_var_bound_arc, - &final_results_arc, - &mut local_results, - dictionary, - ); - - local_results - }, - ) - .reduce( - || Vec::new(), - |mut 
acc, mut chunk| { - if acc.is_empty() { - return chunk; - } - if chunk.is_empty() { - return acc; - } - - // Pre-allocate to avoid reallocation during append - if acc.capacity() < acc.len() + chunk.len() { - acc.reserve(chunk.len()); - } - acc.append(&mut chunk); - acc - }, - ); - - results - } - - pub fn istream(&self, last_timestamp: u64) -> Vec { - let mut new_triples = vec![]; - for ts_triple in &self.streams { - if ts_triple.timestamp > last_timestamp { - new_triples.push(ts_triple.triple.clone()); - } - } - new_triples - } - - pub fn dstream(&self, last_timestamp: u64, current_timestamp: u64) -> Vec { - let mut old_triples = BTreeSet::new(); - let mut current_triples = BTreeSet::new(); - - for ts_triple in &self.streams { - if ts_triple.timestamp <= last_timestamp { - old_triples.insert(ts_triple.triple.clone()); - } - if ts_triple.timestamp <= current_timestamp { - current_triples.insert(ts_triple.triple.clone()); - } - } - - old_triples.difference(¤t_triples).cloned().collect() - } - - pub fn rstream(&self, start: u64, end: u64) -> Vec { - let mut current_triples = BTreeSet::new(); - - for ts_triple in &self.streams { - if ts_triple.timestamp >= start && ts_triple.timestamp <= end { - current_triples.insert(ts_triple.triple.clone()); - } - } - - current_triples.into_iter().collect() - } - - pub fn set_sliding_window(&mut self, width: u64, slide: u64) { - self.sliding_window = Some(SlidingWindow::new(width, slide)); - } - - pub fn evaluate_sliding_window(&mut self) -> Vec { - if let Some(window) = &self.sliding_window { - let current_time = current_timestamp(); - let start_time = if current_time > window.width { - current_time - window.width - } else { - 0 - }; - - let result = self.rstream(start_time, current_time); - - // Update last evaluated time - self.sliding_window.as_mut().unwrap().last_evaluated = current_time; - - result - } else { - Vec::new() - } - } - - pub fn window_close_policy(&mut self) -> Vec { - let mut result = vec![]; - if let 
Some(window) = &self.sliding_window { - let current_time = current_timestamp(); - if current_time >= window.last_evaluated + window.slide { - result = self.evaluate_sliding_window(); - } - } - result - } - - pub fn content_change_policy(&mut self) -> Vec { - let mut _result = vec![]; - let initial_state: BTreeSet<_> = self.triples.clone(); - if let Some(_window) = &self.sliding_window { - _result = self.evaluate_sliding_window(); - let current_state: BTreeSet<_> = self.triples.clone(); - if initial_state != current_state { - return _result; - } - } - vec![] - } - - pub fn non_empty_content_policy(&mut self) -> Vec { - let result = self.evaluate_sliding_window(); - if !result.is_empty() { - return result; - } - vec![] - } - - pub fn periodic_policy(&mut self, interval: std::time::Duration) -> Vec { - let mut result = vec![]; - if let Some(window) = &self.sliding_window { - let current_time = current_timestamp(); - if current_time >= window.last_evaluated + interval.as_secs() { - result = self.evaluate_sliding_window(); - } - } - result - } - - pub fn auto_policy_evaluation(&mut self) -> Vec { - let current_time = current_timestamp(); - let mut result = vec![]; - - if let Some(window) = &self.sliding_window { - if current_time >= window.last_evaluated + window.slide { - println!("Window Close Policy"); - result.extend(self.evaluate_sliding_window()); - } - } - - let initial_state: BTreeSet<_> = self.triples.clone(); - if let Some(_window) = &self.sliding_window { - let current_state: BTreeSet<_> = self.triples.clone(); - if initial_state != current_state { - println!("Content Change Policy"); - result.extend(self.evaluate_sliding_window()); - } - } - - let non_empty_result = self.evaluate_sliding_window(); - if !non_empty_result.is_empty() { - println!("Non-empty Content Policy"); - result.extend(non_empty_result); - } - - let interval = std::time::Duration::new(5, 0); - if let Some(window) = &self.sliding_window { - if current_time >= window.last_evaluated + 
interval.as_secs() { - println!("Periodic Policy"); - result.extend(self.evaluate_sliding_window()); - } - } - - result - } - - pub fn handle_query(&mut self, query: &str) -> String { - // Assume the query string is in a basic format like "subject predicate object" - let parts: Vec<&str> = query.split_whitespace().collect(); - - if parts.len() != 3 { - return "Invalid query format. Expected 'subject predicate object'.".to_string(); - } - - let subject = parts[0]; - let predicate = parts[1]; - let object = parts[2]; - - let mut dict = self.dictionary.write().unwrap(); - let subject_id = dict.encode(subject); - let predicate_id = dict.encode(predicate); - let object_id = dict.encode(object); - - let mut result = String::new(); - for triple in &self.triples { - if triple.subject == subject_id - && triple.predicate == predicate_id - && triple.object == object_id - { - result.push_str(&format!( - "Subject: {}, Predicate: {}, Object: {}\n", - dict.decode(triple.subject).unwrap(), - dict.decode(triple.predicate).unwrap(), - dict.decode(triple.object).unwrap() - )); - } - } - drop(dict); - - if result.is_empty() { - result = "No matching triples found.".to_string(); - } - - result - } - - pub fn handle_update(&mut self, update: &str) -> String { - // Parse the SPARQL update and apply changes to the database - if update.starts_with("INSERT") { - // Extract the part between curly braces - if let Some(start) = update.find('{') { - if let Some(end) = update.find('}') { - let triple_str = &update[start + 1..end].trim(); - let parts: Vec<&str> = triple_str.split_whitespace().collect(); - - if parts.len() == 3 { - let subject = parts[0].to_string(); - let predicate = parts[1].to_string(); - let object = parts[2].to_string(); - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - self.triples.insert(triple); - return "Update 
Successful".to_string(); - } - } - } - } else if update.starts_with("DELETE") { - // Extract the part between curly braces - if let Some(start) = update.find('{') { - if let Some(end) = update.find('}') { - let triple_str = &update[start + 1..end].trim(); - let parts: Vec<&str> = triple_str.split_whitespace().collect(); - - if parts.len() == 3 { - let subject = parts[0].to_string(); - let predicate = parts[1].to_string(); - let object = parts[2].to_string(); - - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - self.triples.remove(&triple); - return "Update Successful".to_string(); - } - } - } - } - "Update Failed".to_string() - } - - pub fn handle_http_request(&mut self, request: &str) -> String { - let mut headers = [httparse::EMPTY_HEADER; 16]; - let mut req = httparse::Request::new(&mut headers); - req.parse(request.as_bytes()).unwrap(); - - match req.method.unwrap() { - "GET" => { - let url = Url::parse(&("http://localhost".to_owned() + req.path.unwrap())).unwrap(); - let query_pairs: HashMap<_, _> = url.query_pairs().into_owned().collect(); - if let Some(query) = query_pairs.get("query") { - return self.handle_query(query); - } - } - "POST" => { - let content_type = req - .headers - .iter() - .find(|header| header.name.eq_ignore_ascii_case("Content-Type")) - .map(|header| header.value); - - if let Some(content_type) = content_type { - if content_type == b"application/sparql-query" { - // Direct POST query - if let Some(body) = request.split("\r\n\r\n").nth(1) { - return self.handle_query(body); - } - } else if content_type == b"application/x-www-form-urlencoded" { - // URL-encoded POST query or update - if let Some(body) = request.split("\r\n\r\n").nth(1) { - let body_decoded = - percent_decode(body.as_bytes()).decode_utf8().unwrap(); - let params: HashMap<_, _> = body_decoded - .split('&') - .map(|pair| { - let 
mut split = pair.split('='); - ( - split.next().unwrap().to_string(), - split.next().unwrap_or("").to_string(), - ) - }) - .collect(); - - if let Some(query) = params.get("query") { - return self.handle_query(query); - } else if let Some(update) = params.get("update") { - return self.handle_update(update); - } + process_join_efficiently( + &subj, + &obj, + subject_var, + object_var, + both_vars_bound, + subject_var_bound, + object_var_bound, + neither_var_bound, + final_results_arc, + local_results, + ); } - } else if content_type == b"application/sparql-update" { - // Direct POST update - if let Some(body) = request.split("\r\n\r\n").nth(1) { - return self.handle_update(body); - } - } } - } - _ => {} - } - - "Bad Request".to_string() - } - - pub fn debug_print_triples(&self) { - let dict = self.dictionary.read().unwrap(); - for triple in &self.triples { - println!( - "Stored Triple -> Subject: {}, Predicate: {}, Object: {}", - dict.decode(triple.subject).unwrap(), - dict.decode(triple.predicate).unwrap(), - dict.decode(triple.object).unwrap() - ); } - } +} - #[cfg(feature = "cuda")] - pub fn perform_hash_join_cuda_wrapper<'a>( - &self, +#[inline(always)] +fn process_join_efficiently<'a>( + subject: &str, + object: &str, subject_var: &'a str, - predicate: String, object_var: &'a str, - triples: Vec, - dictionary: &'a Dictionary, - final_results: Vec>, - literal_filter: Option, - ) -> Vec> { - if final_results.is_empty() { - return Vec::new(); - } - - // Prepare data for CUDA - let subjects: Vec = triples.iter().map(|t| t.subject).collect(); - let predicates: Vec = triples.iter().map(|t| t.predicate).collect(); - let objects: Vec = triples.iter().map(|t| t.object).collect(); - - let predicate_filter = dictionary.clone().encode(&predicate); - - let literal_filter_value = literal_filter - .as_ref() - .map(|lit| dictionary.clone().encode(lit)) - .unwrap_or(0); - - let literal_filter_option = if literal_filter.is_some() { - Some(literal_filter_value) - } else { - None - 
}; - - // Call CUDA function - let matching_indices = hash_join_cuda( - &subjects, - &predicates, - &objects, - predicate_filter, - literal_filter_option, - ); - - // Prepare variable bindings - let mut both_vars_bound: HashMap<(String, String), Vec>> = - HashMap::new(); - let mut subject_var_bound: HashMap>> = HashMap::new(); - let mut object_var_bound: HashMap>> = HashMap::new(); - let mut neither_var_bound: Vec> = Vec::new(); - - for result in final_results { - let subject_binding = result.get(subject_var).cloned(); - let object_binding = result.get(object_var).cloned(); - - match (subject_binding, object_binding) { - (Some(subj_val), Some(obj_val)) => { - both_vars_bound - .entry((subj_val.clone(), obj_val.clone())) - .or_default() - .push(result); - } - (Some(subj_val), None) => { - subject_var_bound - .entry(subj_val.clone()) - .or_default() - .push(result); - } - (None, Some(obj_val)) => { - object_var_bound - .entry(obj_val.clone()) - .or_default() - .push(result); - } - (None, None) => { - neither_var_bound.push(result); - } - } - } - - // Reconstruct results - let mut results = Vec::new(); - - for idx in matching_indices { - let triple = &triples[idx as usize]; - - if let (Some(subject), Some(object)) = ( - dictionary.decode(triple.subject), - dictionary.decode(triple.object), - ) { - // Process group both_vars_bound - { - let key = (subject.to_string(), object.to_string()); - if let Some(results_vec) = both_vars_bound.get(&key) { - for result in results_vec { - let extended_result = result.clone(); - results.push(extended_result); - } - } - } - - // Process group subject_var_bound - { - if let Some(results_vec) = subject_var_bound.get(subject) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { + both_vars_bound: &Arc>>, + subject_var_bound: &Arc>>, 
+ object_var_bound: &Arc>>, + neither_var_bound: &Arc>, + final_results_arc: &Arc>>, + local_results: &mut Vec>, +) { + if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { + for &idx in result_indices { + // Clone efficiently with pre-allocation + let result = final_results_arc[idx].clone(); + local_results.push(result); + } + return; // Early return after handling the most restrictive case + } + + // Check for subject var bound - second most restrictive + if let Some(result_indices) = subject_var_bound.get(subject) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + // Check for object consistency if it exists + if let Some(existing_object) = base_result.get(object_var) { + if existing_object == object { + local_results.push(base_result.clone()); + } + } else { + let mut extended_result = base_result.clone(); extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); + local_results.push(extended_result); } - } } + } - // Process group object_var_bound - { - if let Some(results_vec) = object_var_bound.get(object) { - for result in results_vec { - let mut extended_result = result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { + // Check for object var bound + if let Some(result_indices) = object_var_bound.get(object) { + for &idx in result_indices { + let base_result = &final_results_arc[idx]; + if let Some(existing_subject) = base_result.get(subject_var) { + if existing_subject == subject { + local_results.push(base_result.clone()); + } + } else { + let mut extended_result = base_result.clone(); extended_result.insert(subject_var, subject.to_string()); - } - results.push(extended_result); - } - } - } - - // Process group neither_var_bound - for result in &neither_var_bound { - let mut extended_result = 
result.clone(); - // Extend subject_var - if let Some(existing_subject) = extended_result.get(subject_var) { - if existing_subject != &subject { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(subject_var, subject.to_string()); - } - // Extend object_var - if let Some(existing_object) = extended_result.get(object_var) { - if existing_object != &object { - continue; // Inconsistent variable binding - } - } else { - extended_result.insert(object_var, object.to_string()); - } - results.push(extended_result); - } - } - } - - results - } - - // Create user defined function - pub fn register_udf(&mut self, name: &str, f: F) -where - F: Fn(Vec<&str>) -> String + Send + Sync + 'static, - { - self.udfs.insert(name.to_string(), ClonableFn::new(f)); - } - - /// Triple to string - pub fn triple_to_string(&self, triple: &Triple, dict: &Dictionary) -> String { - let subject = dict.decode(triple.subject); - let predicate = dict.decode(triple.predicate); - let object = dict.decode(triple.object); - format!("{} {} {}", subject.unwrap(), predicate.unwrap(), object.unwrap()) - } - - pub fn decode_triple(&self, triple: &Triple) -> Option<(String, String, String)> { - let dict = self.dictionary.read().unwrap(); - let subject = dict.decode(triple.subject)?.to_string(); - let predicate = dict.decode(triple.predicate)?.to_string(); - let object = dict.decode(triple.object)?.to_string(); - drop(dict); - - Some((subject, predicate, object)) - } -} - -#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), target_feature(enable = "sse2"))] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))] -pub unsafe fn simd_eq(a: &[u8], b: &[u8]) -> bool { - if a.len() != b.len() { - return false; - } - - // SSE2 implementation for x86/x86_64 - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - let len = a.len(); - let chunks = len / 16; - let mut i = 0; - while i < chunks * 16 { - let pa = a.as_ptr().add(i) as *const __m128i; - let 
pb = b.as_ptr().add(i) as *const __m128i; - let va = _mm_loadu_si128(pa); - let vb = _mm_loadu_si128(pb); - let cmp = _mm_cmpeq_epi8(va, vb); - let mask = _mm_movemask_epi8(cmp); - if mask != 0xFFFF { - return false; - } - i += 16; - } - // Compare any remaining bytes - for j in (chunks * 16)..len { - if a[j] != b[j] { - return false; - } - } - return true; - } - - // NEON implementation for aarch64 - #[cfg(target_arch = "aarch64")] - { - let len = a.len(); - let chunks = len / 16; - let mut i = 0; - while i < chunks * 16 { - let pa = a.as_ptr().add(i); - let pb = b.as_ptr().add(i); - let va = vld1q_u8(pa); - let vb = vld1q_u8(pb); - let cmp = vceqq_u8(va, vb); - let cmp_u64 = vreinterpretq_u64_u8(cmp); - let low = vgetq_lane_u64(cmp_u64, 0); - let high = vgetq_lane_u64(cmp_u64, 1); - if low != u64::MAX || high != u64::MAX { - return false; - } - i += 16; - } - // Compare any remaining bytes - for j in (chunks * 16)..len { - if a[j] != b[j] { - return false; - } + local_results.push(extended_result); + } + } } - return true; - } - - // Fallback for other architectures - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] - { - return a == b; - } -} -#[inline] -fn simd_bytes_eq(a: &[u8], b: &[u8]) -> bool { - if a.len() != b.len() { - return false; - } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { - use std::arch::x86_64::*; - let mut i = 0; - let len = a.len(); - while i + 16 <= len { - let a_chunk = _mm_loadu_si128(a.as_ptr().add(i) as *const __m128i); - let b_chunk = _mm_loadu_si128(b.as_ptr().add(i) as *const __m128i); - let cmp = _mm_cmpeq_epi8(a_chunk, b_chunk); - // If all 16 bytes match, _mm_movemask_epi8 returns 0xFFFF. - if _mm_movemask_epi8(cmp) != 0xFFFF { - return false; - } - i += 16; - } - // Compare any remaining bytes. 
- for j in i..len { - if a[j] != b[j] { - return false; - } - } - true - } - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] - { - // Fallback on non-x86 architectures. - a == b - } -} + // Process least restrictive case - neither var bound + for &idx in neither_var_bound.iter() { + let base_result = &final_results_arc[idx]; -#[inline(always)] -fn process_join<'a>( - subject: &str, - object: &str, - subject_var: &'a str, - object_var: &'a str, - both_vars_bound: &Arc>>, - subject_var_bound: &Arc>>, - object_var_bound: &Arc>>, - neither_var_bound: &Arc>, - final_results_arc: &Arc>>, - local_results: &mut Vec>, -) { - // Check both_vars_bound - most restrictive case first - if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { - for &idx in result_indices { - local_results.push(final_results_arc[idx].clone()); - } - } - - // Process subject_var_bound - if let Some(result_indices) = subject_var_bound.get(subject) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - // Check for object consistency if it exists - if let Some(existing_object) = base_result.get(object_var) { - if existing_object == object { - local_results.push(base_result.clone()); - } - } else { - // Bind the object variable - let mut extended_result = base_result.clone(); - extended_result.insert(object_var, object.to_string()); - local_results.push(extended_result); - } - } - } - - // Process object_var_bound - if let Some(result_indices) = object_var_bound.get(object) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - // Check for subject consistency if it exists - if let Some(existing_subject) = base_result.get(subject_var) { - if existing_subject == subject { - local_results.push(base_result.clone()); - } - } else { - // Bind the subject variable - let mut extended_result = base_result.clone(); - extended_result.insert(subject_var, subject.to_string()); - 
local_results.push(extended_result); - } - } - } - - // Process neither_var_bound - least restrictive case last - for &idx in neither_var_bound.iter() { - let base_result = &final_results_arc[idx]; - - // Check both consistency constraints - let subject_consistent = base_result - .get(subject_var) - .map_or(true, |existing| existing == subject); - let object_consistent = base_result - .get(object_var) - .map_or(true, |existing| existing == object); - - if subject_consistent && object_consistent { - let mut extended_result = base_result.clone(); - - // Only insert if not already present - if !base_result.contains_key(subject_var) { - extended_result.insert(subject_var, subject.to_string()); - } - if !base_result.contains_key(object_var) { - extended_result.insert(object_var, object.to_string()); - } - - local_results.push(extended_result); - } - } -} + // Check both consistency constraints + let subject_consistent = base_result + .get(subject_var) + .map_or(true, |existing| existing == subject); + let object_consistent = base_result + .get(object_var) + .map_or(true, |existing| existing == object); -#[inline(always)] -fn process_triple_chunk<'a>( - triple_chunk: &[Triple], - predicate_bytes: &[u8], - literal_filter_bytes: &Option<&[u8]>, - subject_var: &'a str, - object_var: &'a str, - both_vars_bound: &Arc>>, - subject_var_bound: &Arc>>, - object_var_bound: &Arc>>, - neither_var_bound: &Arc>, - final_results_arc: &Arc>>, - local_results: &mut Vec>, - dictionary: &'a Dictionary, -) { - // Pre-filter triples to avoid unnecessary decoding - for triple in triple_chunk { - let pred_opt = dictionary.decode(triple.predicate); - if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { - continue; - } - - if let Some(filter_bytes) = literal_filter_bytes { - let obj_opt = dictionary.decode(triple.object); - if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { - continue; - } - - if let Some(subj) = 
dictionary.decode(triple.subject) { - process_join_efficiently( - &subj, - obj_opt.unwrap(), - subject_var, - object_var, - both_vars_bound, - subject_var_bound, - object_var_bound, - neither_var_bound, - final_results_arc, - local_results, - ); - } - } else { - let subj_opt = dictionary.decode(triple.subject); - let obj_opt = dictionary.decode(triple.object); - - if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { - process_join_efficiently( - &subj, - &obj, - subject_var, - object_var, - both_vars_bound, - subject_var_bound, - object_var_bound, - neither_var_bound, - final_results_arc, - local_results, - ); - } - } - } -} + if subject_consistent && object_consistent { + let mut extended_result = base_result.clone(); + // Only insert if not already present + if !base_result.contains_key(subject_var) { + extended_result.insert(subject_var, subject.to_string()); + } + if !base_result.contains_key(object_var) { + extended_result.insert(object_var, object.to_string()); + } -#[inline(always)] -fn process_join_efficiently<'a>( - subject: &str, - object: &str, - subject_var: &'a str, - object_var: &'a str, - both_vars_bound: &Arc>>, - subject_var_bound: &Arc>>, - object_var_bound: &Arc>>, - neither_var_bound: &Arc>, - final_results_arc: &Arc>>, - local_results: &mut Vec>, -) { - if let Some(result_indices) = both_vars_bound.get(&(subject.to_string(), object.to_string())) { - for &idx in result_indices { - // Clone efficiently with pre-allocation - let result = final_results_arc[idx].clone(); - local_results.push(result); - } - return; // Early return after handling the most restrictive case - } - - // Check for subject var bound - second most restrictive - if let Some(result_indices) = subject_var_bound.get(subject) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - // Check for object consistency if it exists - if let Some(existing_object) = base_result.get(object_var) { - if existing_object == object { - 
local_results.push(base_result.clone()); - } - } else { - let mut extended_result = base_result.clone(); - extended_result.insert(object_var, object.to_string()); - local_results.push(extended_result); - } - } - } - - // Check for object var bound - if let Some(result_indices) = object_var_bound.get(object) { - for &idx in result_indices { - let base_result = &final_results_arc[idx]; - if let Some(existing_subject) = base_result.get(subject_var) { - if existing_subject == subject { - local_results.push(base_result.clone()); - } - } else { - let mut extended_result = base_result.clone(); - extended_result.insert(subject_var, subject.to_string()); - local_results.push(extended_result); - } - } - } - - // Process least restrictive case - neither var bound - for &idx in neither_var_bound.iter() { - let base_result = &final_results_arc[idx]; - - // Check both consistency constraints - let subject_consistent = base_result - .get(subject_var) - .map_or(true, |existing| existing == subject); - let object_consistent = base_result - .get(object_var) - .map_or(true, |existing| existing == object); - - if subject_consistent && object_consistent { - let mut extended_result = base_result.clone(); - - // Only insert if not already present - if !base_result.contains_key(subject_var) { - extended_result.insert(subject_var, subject.to_string()); - } - if !base_result.contains_key(object_var) { - extended_result.insert(object_var, object.to_string()); - } - - local_results.push(extended_result); - } - } + local_results.push(extended_result); + } + } } - diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs index 45351e4..4418c50 100644 --- a/shared/src/index_manager/buckets.rs +++ b/shared/src/index_manager/buckets.rs @@ -1,314 +1,517 @@ -use serde::{Serialize, Deserialize}; - -use std::collections::{HashSet, HashMap}; +use std::collections::{HashMap, HashSet}; +use crate::index_manager::*; +use crate::query::PlannedAccessPattern; use crate::terms::*; 
use crate::triple::Triple; -use crate::index_manager::*; -// ── helpers ────────────────────────────────────────────────────────────────── - -fn get_triple_field(triple: &Triple, pos: usize) -> u32 { - match pos { - 0 => triple.subject, - 1 => triple.predicate, - 2 => triple.object, - _ => panic!("invalid position {pos}"), - } -} - -fn is_one_constant_pattern(pattern: &TriplePattern) -> bool { - let (s, p, o) = pattern; - matches!( - (s, p, o), - (Constant(_), Variable(_), Variable(_)) - | (Variable(_), Constant(_), Variable(_)) - | (Variable(_), Variable(_), Constant(_)) - ) +#[derive(Debug, Clone)] +pub enum BucketStore { + // 0 Dynamic Variables + D0_F0(bool), + D0_F1(HashSet), + D0_F2(HashSet<[u32; 2]>), + D0_F3(HashSet<[u32; 3]>), + + // >0 Dynamic Variables, 0 Free Variables (Existence checks) + D1_F0(HashSet), + D2_F0(HashSet<[u32; 2]>), + D3_F0(HashSet<[u32; 3]>), + + // >0 Dynamic Variables, >0 Free Variables (Map lookups) + D1_F1(HashMap>), + D1_F2(HashMap>), + D2_F1(HashMap<[u32; 2], HashSet>), } -// ── TwoWayData ──────────────────────────────────────────────────────────────── - -/// Bidirectional index for buckets with exactly one bound constant. -/// -/// Given a pattern like `(?s, C, ?o)`: -/// - `pos_a = 0` (subject), `pos_b = 2` (object), `const_pos = 1`, `const_val = C` -/// - `forward`: subject → { objects … } -/// - `backward`: object → { subjects … } -/// -/// A query that binds `pos_a` (e.g. `bound_s, C, ?o`) is served by a single -/// `forward` lookup instead of iterating the whole bucket. 
#[derive(Debug, Clone)] -pub struct TwoWayData { - pos_a: usize, - pos_b: usize, - const_pos: usize, - const_val: u32, - forward: HashMap>, // pos_a_val → { pos_b_val, … } - backward: HashMap>, // pos_b_val → { pos_a_val, … } +pub struct DirectedBucket { + pub pattern: TriplePattern, + pub c_positions: Vec, // Constants + pub d_positions: Vec, // Dynamic (Pipeline-bound) + pub f_positions: Vec, // Free (Unbound) + pub c_values: Vec, // The actual constant values + pub data: BucketStore, } -impl TwoWayData { - fn from_pattern(pattern: &TriplePattern) -> Self { - let (s, p, o) = pattern; - let mut free = Vec::new(); - let mut const_pos = 0; - let mut const_val = 0u32; - - for (i, term) in [s, p, o].iter().enumerate() { - match term { - Variable(_) => free.push(i), - Constant(c) => { const_pos = i; const_val = *c; } +impl DirectedBucket { + pub fn new(planned: PlannedAccessPattern) -> Self { + let mut c_positions = Vec::new(); + let mut d_positions = Vec::new(); + let mut f_positions = Vec::new(); + let mut c_values = Vec::new(); + + let mut check_pos = |term: &Term, is_bound: bool, pos: usize| match term { + Term::Constant(c) => { + c_positions.push(pos); + c_values.push(*c); } - } + Term::Variable(_) => { + if is_bound { + d_positions.push(pos); + } else { + f_positions.push(pos); + } + } + }; - assert_eq!(free.len(), 2, "TwoWayData requires exactly one constant"); + check_pos(&planned.pattern.0, planned.bound_subject, 0); + check_pos(&planned.pattern.1, planned.bound_predicate, 1); + check_pos(&planned.pattern.2, planned.bound_object, 2); + + let data = match (d_positions.len(), f_positions.len()) { + (0, 0) => BucketStore::D0_F0(false), + (0, 1) => BucketStore::D0_F1(HashSet::new()), + (0, 2) => BucketStore::D0_F2(HashSet::new()), + (0, 3) => BucketStore::D0_F3(HashSet::new()), + (1, 0) => BucketStore::D1_F0(HashSet::new()), + (2, 0) => BucketStore::D2_F0(HashSet::new()), + (3, 0) => BucketStore::D3_F0(HashSet::new()), + (1, 1) => 
BucketStore::D1_F1(HashMap::new()), + (1, 2) => BucketStore::D1_F2(HashMap::new()), + (2, 1) => BucketStore::D2_F1(HashMap::new()), + _ => unreachable!("Invalid number of variables in triple"), + }; Self { - pos_a: free[0], pos_b: free[1], - const_pos, const_val, - forward: HashMap::new(), backward: HashMap::new(), + pattern: planned.pattern, + c_positions, + d_positions, + f_positions, + c_values, + data, } } - fn build_triple(&self, a: u32, b: u32) -> Triple { - let mut vals = [0u32; 3]; - vals[self.pos_a] = a; - vals[self.pos_b] = b; - vals[self.const_pos] = self.const_val; - Triple { subject: vals[0], predicate: vals[1], object: vals[2] } + #[inline(always)] + fn get_triple_field(triple: &Triple, pos: usize) -> u32 { + match pos { + 0 => triple.subject, + 1 => triple.predicate, + 2 => triple.object, + _ => unreachable!(), + } } - fn insert(&mut self, triple: &Triple) -> bool { - let a = get_triple_field(triple, self.pos_a); - let b = get_triple_field(triple, self.pos_b); - let inserted = self.forward.entry(a).or_default().insert(b); - self.backward.entry(b).or_default().insert(a); - inserted + pub fn matches(&self, triple: &Triple) -> bool { + for (i, &pos) in self.c_positions.iter().enumerate() { + if Self::get_triple_field(triple, pos) != self.c_values[i] { + return false; + } + } + true } - fn remove(&mut self, triple: &Triple) -> bool { - let a = get_triple_field(triple, self.pos_a); - let b = get_triple_field(triple, self.pos_b); - - let removed = if let Some(set) = self.forward.get_mut(&a) { - let r = set.remove(&b); - if set.is_empty() { self.forward.remove(&a); } - r - } else { false }; + pub fn insert(&mut self, triple: &Triple) -> bool { + let get_val = |pos| Self::get_triple_field(triple, pos); - if removed { - if let Some(set) = self.backward.get_mut(&b) { - set.remove(&a); - if set.is_empty() { self.backward.remove(&b); } + match &mut self.data { + BucketStore::D0_F0(b) => { + let old = *b; + *b = true; + !old } + BucketStore::D0_F1(s) => 
s.insert(get_val(self.f_positions[0])), + BucketStore::D0_F2(s) => { + s.insert([get_val(self.f_positions[0]), get_val(self.f_positions[1])]) + } + BucketStore::D0_F3(s) => s.insert([ + get_val(self.f_positions[0]), + get_val(self.f_positions[1]), + get_val(self.f_positions[2]), + ]), + BucketStore::D1_F0(s) => s.insert(get_val(self.d_positions[0])), + BucketStore::D2_F0(s) => { + s.insert([get_val(self.d_positions[0]), get_val(self.d_positions[1])]) + } + BucketStore::D3_F0(s) => s.insert([ + get_val(self.d_positions[0]), + get_val(self.d_positions[1]), + get_val(self.d_positions[2]), + ]), + BucketStore::D1_F1(m) => m + .entry(get_val(self.d_positions[0])) + .or_default() + .insert(get_val(self.f_positions[0])), + BucketStore::D1_F2(m) => m + .entry(get_val(self.d_positions[0])) + .or_default() + .insert([get_val(self.f_positions[0]), get_val(self.f_positions[1])]), + BucketStore::D2_F1(m) => m + .entry([get_val(self.d_positions[0]), get_val(self.d_positions[1])]) + .or_default() + .insert(get_val(self.f_positions[0])), } - - removed } - /// Query using `q[0..=2]` = `[s, p, o]` as `Option`. - /// The constant position is already guaranteed to match by the time we get here. 
- fn query(&self, q: [Option; 3]) -> Vec { - let qa = q[self.pos_a]; - let qb = q[self.pos_b]; - - match (qa, qb) { - // One free dimension bound → single hashmap lookup, O(output) - (Some(a), None) => { - self.forward.get(&a).map_or(Vec::new(), |bs| { - bs.iter().map(|&b| self.build_triple(a, b)).collect() - }) - } - (None, Some(b)) => { - self.backward.get(&b).map_or(Vec::new(), |as_| { - as_.iter().map(|&a| self.build_triple(a, b)).collect() - }) - } - // Both free dimensions bound → existence check - (Some(a), Some(b)) => { - if self.forward.get(&a).map_or(false, |bs| bs.contains(&b)) { - vec![self.build_triple(a, b)] + pub fn remove(&mut self, triple: &Triple) -> bool { + let get_val = |pos| Self::get_triple_field(triple, pos); + + match &mut self.data { + BucketStore::D0_F0(b) => { + let old = *b; + *b = false; + old + } + BucketStore::D0_F1(s) => s.remove(&get_val(self.f_positions[0])), + BucketStore::D0_F2(s) => { + s.remove(&[get_val(self.f_positions[0]), get_val(self.f_positions[1])]) + } + BucketStore::D0_F3(s) => s.remove(&[ + get_val(self.f_positions[0]), + get_val(self.f_positions[1]), + get_val(self.f_positions[2]), + ]), + BucketStore::D1_F0(s) => s.remove(&get_val(self.d_positions[0])), + BucketStore::D2_F0(s) => { + s.remove(&[get_val(self.d_positions[0]), get_val(self.d_positions[1])]) + } + BucketStore::D3_F0(s) => s.remove(&[ + get_val(self.d_positions[0]), + get_val(self.d_positions[1]), + get_val(self.d_positions[2]), + ]), + BucketStore::D1_F1(m) => { + let k = get_val(self.d_positions[0]); + if let Some(set) = m.get_mut(&k) { + let removed = set.remove(&get_val(self.f_positions[0])); + if set.is_empty() { + m.remove(&k); + } + removed + } else { + false + } + } + BucketStore::D1_F2(m) => { + let k = get_val(self.d_positions[0]); + if let Some(set) = m.get_mut(&k) { + let removed = + set.remove(&[get_val(self.f_positions[0]), get_val(self.f_positions[1])]); + if set.is_empty() { + m.remove(&k); + } + removed } else { - Vec::new() + false } } 
- // Nothing extra bound → dump everything - (None, None) => { - self.forward.iter() - .flat_map(|(&a, bs)| bs.iter().map(move |&b| (a, b))) - .map(|(a, b)| self.build_triple(a, b)) - .collect() + BucketStore::D2_F1(m) => { + let k = [get_val(self.d_positions[0]), get_val(self.d_positions[1])]; + if let Some(set) = m.get_mut(&k) { + let removed = set.remove(&get_val(self.f_positions[0])); + if set.is_empty() { + m.remove(&k); + } + removed + } else { + false + } } } } - fn triple_count(&self) -> usize { - self.forward.values().map(|s| s.len()).sum() - } - - fn clear(&mut self) { - self.forward.clear(); - self.backward.clear(); - } - - fn shrink_to_fit(&mut self) { - for s in self.forward.values_mut() { s.shrink_to_fit(); } - for s in self.backward.values_mut() { s.shrink_to_fit(); } - self.forward.shrink_to_fit(); - self.backward.shrink_to_fit(); - } -} - -// ── BucketData ──────────────────────────────────────────────────────────────── - -#[derive(Debug, Clone)] -pub enum BucketData { - /// Patterns with 0, 2, or 3 constants – a flat set is fine. - Simple(HashSet), - /// Pattern with exactly 1 constant – bidirectional maps for O(output) lookups. 
- TwoWay(TwoWayData), -} - -// ── Bucket ──────────────────────────────────────────────────────────────────── + pub fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); -#[derive(Debug, Clone)] -pub struct Bucket { - pub pattern: TriplePattern, - pub data: BucketData, -} + // Extract constants and queried dynamics once per query call + let mut t_base = [0u32; 3]; + for (i, &pos) in self.c_positions.iter().enumerate() { + t_base[pos] = self.c_values[i]; + } + for &pos in &self.d_positions { + t_base[pos] = match pos { + 0 => s.unwrap(), + 1 => p.unwrap(), + 2 => o.unwrap(), + _ => unreachable!(), + }; + } -impl Bucket { - pub fn new(pattern: TriplePattern) -> Self { - let data = if is_one_constant_pattern(&pattern) { - BucketData::TwoWay(TwoWayData::from_pattern(&pattern)) - } else { - BucketData::Simple(HashSet::new()) + // Inline macro/closure to quickly instantiate the triple without inner loops + let mut push_res = |f_vals: &[u32]| { + let mut t = t_base; + for (i, &pos) in self.f_positions.iter().enumerate() { + t[pos] = f_vals[i]; + } + results.push(Triple { + subject: t[0], + predicate: t[1], + object: t[2], + }); }; - Self { pattern, data } - } - pub fn matches(&self, triple: &Triple) -> bool { - let (s, p, o) = &self.pattern; - let s_ok = match s { Constant(c) => triple.subject == *c, Variable(_) => true }; - let p_ok = match p { Constant(c) => triple.predicate == *c, Variable(_) => true }; - let o_ok = match o { Constant(c) => triple.object == *c, Variable(_) => true }; - s_ok && p_ok && o_ok - } - - pub fn insert(&mut self, triple: &Triple) -> bool { - match &mut self.data { - BucketData::Simple(set) => set.insert(triple.clone()), - BucketData::TwoWay(tw) => tw.insert(triple), + match &self.data { + BucketStore::D0_F0(b) => { + if *b { + push_res(&[]); + } + } + BucketStore::D0_F1(set) => { + for &f in set { + push_res(&[f]); + } + } + BucketStore::D0_F2(set) => { + for &f in set { + push_res(&f); + } + } + 
BucketStore::D0_F3(set) => { + for &f in set { + push_res(&f); + } + } + BucketStore::D1_F0(set) => { + if set.contains(&t_base[self.d_positions[0]]) { + push_res(&[]); + } + } + BucketStore::D2_F0(set) => { + if set.contains(&[t_base[self.d_positions[0]], t_base[self.d_positions[1]]]) { + push_res(&[]); + } + } + BucketStore::D3_F0(set) => { + if set.contains(&[ + t_base[self.d_positions[0]], + t_base[self.d_positions[1]], + t_base[self.d_positions[2]], + ]) { + push_res(&[]); + } + } + BucketStore::D1_F1(map) => { + if let Some(set) = map.get(&t_base[self.d_positions[0]]) { + for &f in set { + push_res(&[f]); + } + } + } + BucketStore::D1_F2(map) => { + if let Some(set) = map.get(&t_base[self.d_positions[0]]) { + for &f in set { + push_res(&f); + } + } + } + BucketStore::D2_F1(map) => { + if let Some(set) = + map.get(&[t_base[self.d_positions[0]], t_base[self.d_positions[1]]]) + { + for &f in set { + push_res(&[f]); + } + } + } } - } - pub fn remove(&mut self, triple: &Triple) -> bool { - match &mut self.data { - BucketData::Simple(set) => set.remove(triple), - BucketData::TwoWay(tw) => tw.remove(triple), - } + // Return immediately without the slow `.into_iter().filter().collect()` + results } - pub fn triple_count(&self) -> usize { + pub fn get_all_triples(&self) -> Vec { + let mut results = Vec::new(); + + let reconstruct = |d_vals: &[u32], f_vals: &[u32]| { + let mut t = [0; 3]; + for (i, &pos) in self.c_positions.iter().enumerate() { + t[pos] = self.c_values[i]; + } + for (i, &pos) in self.d_positions.iter().enumerate() { + t[pos] = d_vals[i]; + } + for (i, &pos) in self.f_positions.iter().enumerate() { + t[pos] = f_vals[i]; + } + Triple { + subject: t[0], + predicate: t[1], + object: t[2], + } + }; + match &self.data { - BucketData::Simple(set) => set.len(), - BucketData::TwoWay(tw) => tw.triple_count(), + BucketStore::D0_F0(b) => { + if *b { + results.push(reconstruct(&[], &[])); + } + } + BucketStore::D0_F1(set) => { + for &f in set { + 
results.push(reconstruct(&[], &[f])); + } + } + BucketStore::D0_F2(set) => { + for &f in set { + results.push(reconstruct(&[], &f)); + } + } + BucketStore::D0_F3(set) => { + for &f in set { + results.push(reconstruct(&[], &f)); + } + } + BucketStore::D1_F0(set) => { + for &d in set { + results.push(reconstruct(&[d], &[])); + } + } + BucketStore::D2_F0(set) => { + for &d in set { + results.push(reconstruct(&d, &[])); + } + } + BucketStore::D3_F0(set) => { + for &d in set { + results.push(reconstruct(&d, &[])); + } + } + BucketStore::D1_F1(map) => { + for (&d, set) in map { + for &f in set { + results.push(reconstruct(&[d], &[f])); + } + } + } + BucketStore::D1_F2(map) => { + for (&d, set) in map { + for &f in set { + results.push(reconstruct(&[d], &f)); + } + } + } + BucketStore::D2_F1(map) => { + for (&d, set) in map { + for &f in set { + results.push(reconstruct(&d, &[f])); + } + } + } } + results } pub fn clear(&mut self) { match &mut self.data { - BucketData::Simple(set) => set.clear(), - BucketData::TwoWay(tw) => tw.clear(), + BucketStore::D0_F0(b) => *b = false, + BucketStore::D0_F1(s) => s.clear(), + BucketStore::D0_F2(s) => s.clear(), + BucketStore::D0_F3(s) => s.clear(), + BucketStore::D1_F0(s) => s.clear(), + BucketStore::D2_F0(s) => s.clear(), + BucketStore::D3_F0(s) => s.clear(), + BucketStore::D1_F1(m) => m.clear(), + BucketStore::D1_F2(m) => m.clear(), + BucketStore::D2_F1(m) => m.clear(), } } pub fn shrink_to_fit(&mut self) { match &mut self.data { - BucketData::Simple(set) => set.shrink_to_fit(), - BucketData::TwoWay(tw) => tw.shrink_to_fit(), - } - } - - /// Return triples that match the given optional bindings. - /// Callers must ensure the bucket covers the query (i.e. `bucket_covers_query` passed). 
- pub fn query(&self, s: Option, p: Option, o: Option) -> Vec { - match &self.data { - BucketData::Simple(set) => { - set.iter() - .filter(|t| { - (s.is_none() || s == Some(t.subject)) && - (p.is_none() || p == Some(t.predicate)) && - (o.is_none() || o == Some(t.object)) - }) - .cloned() - .collect() - } - BucketData::TwoWay(tw) => tw.query([s, p, o]), + BucketStore::D0_F0(_) => {} + BucketStore::D0_F1(s) => s.shrink_to_fit(), + BucketStore::D0_F2(s) => s.shrink_to_fit(), + BucketStore::D0_F3(s) => s.shrink_to_fit(), + BucketStore::D1_F0(s) => s.shrink_to_fit(), + BucketStore::D2_F0(s) => s.shrink_to_fit(), + BucketStore::D3_F0(s) => s.shrink_to_fit(), + BucketStore::D1_F1(m) => { + for v in m.values_mut() { + v.shrink_to_fit(); + } + m.shrink_to_fit(); + } + BucketStore::D1_F2(m) => { + for v in m.values_mut() { + v.shrink_to_fit(); + } + m.shrink_to_fit(); + } + BucketStore::D2_F1(m) => { + for v in m.values_mut() { + v.shrink_to_fit(); + } + m.shrink_to_fit(); + } } } } -// ── BucketIndex ─────────────────────────────────────────────────────────────── - #[derive(Debug, Clone)] pub struct BucketIndex { - pub buckets: Vec, + pub buckets: Vec, } impl BucketIndex { - pub fn new(patterns: Vec) -> Self { - let mut unique_patterns: Vec = Vec::new(); - - for p in patterns { - let is_dup = unique_patterns.iter().any(|e| Self::patterns_equivalent(e, &p)); - if !is_dup { unique_patterns.push(p); } - } - - println!("--- BucketIndex Initialization ---"); - println!("Requested patterns: {}, Unique buckets created: {}", unique_patterns.len(), unique_patterns.len()); - if unique_patterns.is_empty() { - println!("WARNING: BucketIndex initialized with 0 patterns! 
No data will be stored."); - } - - let buckets = unique_patterns.into_iter().enumerate().map(|(i, pat)| { - println!(" Bucket [{}]: {:?}", i, pat); - Bucket::new(pat) - }).collect(); - + pub fn new(patterns: Vec) -> Self { + println!("[Bucket Debug] --- BucketIndex Initialization ---"); + println!( + "[Bucket Debug] Requested planned patterns: {}", + patterns.len() + ); + let buckets: Vec = patterns + .into_iter() + .enumerate() + .map(|(i, pat)| { + let b = DirectedBucket::new(pat); + println!( + "[Bucket Debug] Bucket [{}]: Pattern: {:?}, C={:?}, D={:?}, F={:?}", + i, b.pattern, b.c_positions, b.d_positions, b.f_positions + ); + b + }) + .collect(); Self { buckets } } - fn patterns_equivalent(p1: &TriplePattern, p2: &TriplePattern) -> bool { - let match_term = |t1: &Term, t2: &Term| match (t1, t2) { - (Constant(c1), Constant(c2)) => c1 == c2, - (Variable(_), Variable(_)) => true, - _ => false, - }; - match_term(&p1.0, &p2.0) && match_term(&p1.1, &p2.1) && match_term(&p1.2, &p2.2) - } - - fn bucket_covers_query(bucket_pat: &TriplePattern, q_s: Option, q_p: Option, q_o: Option) -> bool { + fn bucket_covers_query( + bucket_pat: &TriplePattern, + q_s: Option, + q_p: Option, + q_o: Option, + ) -> bool { let (b_s, b_p, b_o) = bucket_pat; - let s_safe = match b_s { Variable(_) => true, Constant(c) => q_s == Some(*c) }; - let p_safe = match b_p { Variable(_) => true, Constant(c) => q_p == Some(*c) }; - let o_safe = match b_o { Variable(_) => true, Constant(c) => q_o == Some(*c) }; + let s_safe = match b_s { + Variable(_) => true, + Constant(c) => q_s == Some(*c), + }; + let p_safe = match b_p { + Variable(_) => true, + Constant(c) => q_p == Some(*c), + }; + let o_safe = match b_o { + Variable(_) => true, + Constant(c) => q_o == Some(*c), + }; s_safe && p_safe && o_safe } - - fn is_exact_match(bucket_pat: &TriplePattern, q_s: Option, q_p: Option, q_o: Option) -> bool { - let (b_s, b_p, b_o) = bucket_pat; - let s_ok = match b_s { Constant(c) => q_s == Some(*c), Variable(_) 
=> q_s.is_none() }; - let p_ok = match b_p { Constant(c) => q_p == Some(*c), Variable(_) => q_p.is_none() }; - let o_ok = match b_o { Constant(c) => q_o == Some(*c), Variable(_) => q_o.is_none() }; - s_ok && p_ok && o_ok - } } impl TripleIndex for BucketIndex { fn clone_empty(&self) -> Box { - let patterns = self.buckets.iter().map(|b| b.pattern.clone()).collect(); + let mut patterns = Vec::new(); + for b in &self.buckets { + let mut bound_subject = false; + let mut bound_predicate = false; + let mut bound_object = false; + + for &pos in &b.d_positions { + match pos { + 0 => bound_subject = true, + 1 => bound_predicate = true, + 2 => bound_object = true, + _ => {} + } + } + + patterns.push(PlannedAccessPattern { + pattern: b.pattern.clone(), + bound_subject, + bound_predicate, + bound_object, + }); + } Box::new(BucketIndex::new(patterns)) } @@ -317,19 +520,22 @@ impl TripleIndex for BucketIndex { } fn triple_count(&self) -> usize { - // Buckets may overlap, so deduplicate. - // For TwoWay buckets we reconstruct triples on the fly; this is the one - // place where the extra memory of TwoWay costs a bit more to count. 
let mut unique: HashSet = HashSet::new(); for bucket in &self.buckets { - // query(None,None,None) works for both Simple and TwoWay - unique.extend(bucket.query(None, None, None)); + unique.extend(bucket.get_all_triples()); } unique.len() } fn supported_access_patterns(&self) -> AccessPatternSupport { - AccessPatternSupport { sp: false, so: false, po: false, ps: false, os: false, op: false } + AccessPatternSupport { + sp: false, + so: false, + po: false, + ps: false, + os: false, + op: false, + } } fn insert(&mut self, triple: &Triple) -> bool { @@ -345,7 +551,7 @@ impl TripleIndex for BucketIndex { fn delete(&mut self, triple: &Triple) -> bool { let mut deleted_anywhere = false; for bucket in &mut self.buckets { - if bucket.remove(triple) { + if bucket.matches(triple) && bucket.remove(triple) { deleted_anywhere = true; } } @@ -354,59 +560,95 @@ impl TripleIndex for BucketIndex { fn build_from_triples(&mut self, triples: &[Triple]) { self.clear(); - - println!("Building BucketIndex with {} triples across {} buckets...", triples.len(), self.buckets.len()); - - if self.buckets.is_empty() { - println!("WARNING: Cannot build from triples because 0 buckets exist!"); - return; - } - - let mut insert_count = 0; for triple in triples { - if self.insert(triple) { insert_count += 1; } + self.insert(triple); } - - println!("Finished building. {}/{} triples matched at least one bucket.", insert_count, triples.len()); self.optimize(); } fn query(&self, s: Option, p: Option, o: Option) -> Vec { - // Exact match: the bucket pattern mirrors the query exactly, no extra filtering. 
- if let Some(b) = self.buckets.iter().find(|b| Self::is_exact_match(&b.pattern, s, p, o)) { - return b.query(s, p, o); - } + let provided_positions = { + let mut pos = Vec::new(); + if s.is_some() { + pos.push(0); + } + if p.is_some() { + pos.push(1); + } + if o.is_some() { + pos.push(2); + } + pos + }; - // Covering match: bucket is more general; Bucket::query() handles the filtering, - // and TwoWay buckets do it in O(output) via a hashmap lookup. - if let Some(b) = self.buckets.iter().find(|b| Self::bucket_covers_query(&b.pattern, s, p, o)) { - return b.query(s, p, o); + for b in self.buckets.iter() { + if Self::bucket_covers_query(&b.pattern, s, p, o) { + // Check if lengths match first to avoid allocating and sorting unless necessary + if b.c_positions.len() + b.d_positions.len() == provided_positions.len() { + let mut expected_provided = b.c_positions.clone(); + expected_provided.extend(&b.d_positions); + expected_provided.sort_unstable(); + + if expected_provided == provided_positions { + return b.query(s, p, o); + } + } + } } - eprintln!("Warning: Query {:?} {:?} {:?} is too general for the existing buckets. Returning empty.", s, p, o); - Vec::new() + panic!( + "[FATAL] NO EXACT MATCH FOUND! 
Query cannot be satisfied optimally by any bucket.\n\ + Query required: s={:?}, p={:?}, o={:?}\n\ + Provided Positions: {:?}", + s, p, o, provided_positions + ); } fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { let (s, p, o) = pattern; - let sub = match s { Constant(x) => Some(*x), Variable(_) => None }; - let pre = match p { Constant(x) => Some(*x), Variable(_) => None }; - let obj = match o { Constant(x) => Some(*x), Variable(_) => None }; + let sub = match s { + Constant(x) => Some(*x), + Variable(_) => None, + }; + let pre = match p { + Constant(x) => Some(*x), + Variable(_) => None, + }; + let obj = match o { + Constant(x) => Some(*x), + Variable(_) => None, + }; self.query(sub, pre, obj) } fn clear(&mut self) { - for bucket in &mut self.buckets { bucket.clear(); } + for bucket in &mut self.buckets { + bucket.clear(); + } } - fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } - fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } - fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } - fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } - fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } - fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } fn optimize(&mut self) { - for bucket in &mut self.buckets { bucket.shrink_to_fit(); } + for bucket in &mut self.buckets { + bucket.shrink_to_fit(); + } } } diff --git a/shared/src/query.rs b/shared/src/query.rs index 2b205b0..519116c 100644 --- a/shared/src/query.rs +++ b/shared/src/query.rs @@ -11,6 +11,16 @@ use 
std::collections::HashMap; use std::time::Duration; +use crate::terms::TriplePattern; + +#[derive(Debug, Clone)] +pub struct PlannedAccessPattern { + pub pattern: TriplePattern, + pub bound_subject: bool, + pub bound_predicate: bool, + pub bound_object: bool, +} + #[derive(Debug, Clone)] pub enum FilterExpression<'a> { Comparison(&'a str, &'a str, &'a str), From ac150b8761397002202ec193207034a28237e088 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Sat, 18 Apr 2026 22:52:29 +0200 Subject: [PATCH 17/23] Implement partial hexastore --- Cargo.lock | 1 - .../n_triples_data/n_triple_10M.rs | 289 ++++++++++------ .../n_triple_10M_all_indexes.sh | 4 +- kolibrie/src/sparql_database.rs | 103 ++++-- shared/src/index_manager/mod.rs | 6 + shared/src/index_manager/partial_hexastore.rs | 325 ++++++++++++++++++ 6 files changed, 583 insertions(+), 145 deletions(-) create mode 100644 shared/src/index_manager/partial_hexastore.rs diff --git a/Cargo.lock b/Cargo.lock index 7d44e47..b31dca3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1438,7 +1438,6 @@ dependencies = [ "rayon", "serde", "serde_json", - "sysinfo", ] [[package]] diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index 7a15a1c..9c0412e 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -25,18 +25,20 @@ use kolibrie::execute_query::*; use kolibrie::sparql_database::*; -use std::fs::File; -use std::io::{BufRead, BufReader}; -use std::time::Instant; use shared::index_manager::*; +use std::collections::{BTreeMap, HashSet}; +use std::fs::{self, File}; +use std::io::{BufRead, BufReader, Write}; +use std::path::Path; +use std::time::Instant; type QuerySpec = (&'static str, &'static str); fn workload_queries() -> Vec { vec![ ( - "C1", - r#"PREFIX wsdbm: + "C1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: 
PREFIX foaf: @@ -59,11 +61,11 @@ fn workload_queries() -> Vec { ?v7 sorg:language ?v8 . } "#, - ), - // C2 - ( - "C2", - r#"PREFIX wsdbm: + ), + // C2 + ( + "C2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -87,11 +89,11 @@ fn workload_queries() -> Vec { ?v8 rev:totalVotes ?v9 . } "#, - ), - // C3 - ( - "C3", - r#"PREFIX wsdbm: + ), + // C3 + ( + "C3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -112,11 +114,11 @@ fn workload_queries() -> Vec { ?v0 foaf:givenName ?v6 . } "#, - ), - // F1 - ( - "F1", - r#"PREFIX wsdbm: + ), + // F1 + ( + "F1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -136,11 +138,11 @@ fn workload_queries() -> Vec { ?v3 rdf:type wsdbm:ProductCategory2 . } "#, - ), - // F2 - ( - "F2", - r#"PREFIX wsdbm: + ), + // F2 + ( + "F2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -162,11 +164,11 @@ fn workload_queries() -> Vec { ?v0 wsdbm:hasGenre . } "#, - ), - // F3 - ( - "F3", - r#"PREFIX wsdbm: + ), + // F3 + ( + "F3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -186,11 +188,11 @@ fn workload_queries() -> Vec { ?v5 wsdbm:purchaseFor ?v0 . } "#, - ), - // F4 - ( - "F4", - r#"PREFIX wsdbm: + ), + // F4 + ( + "F4", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -213,11 +215,11 @@ fn workload_queries() -> Vec { ?v7 wsdbm:likes ?v0 . } "#, - ), - // F5 - ( - "F5", - r#"PREFIX wsdbm: + ), + // F5 + ( + "F5", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -237,11 +239,11 @@ fn workload_queries() -> Vec { ?v1 rdf:type ?v6 . } "#, - ), - // L1 - ( - "L1", - r#"PREFIX wsdbm: + ), + // L1 + ( + "L1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -258,11 +260,11 @@ fn workload_queries() -> Vec { ?v0 wsdbm:likes ?v2 . } "#, - ), - // L2 - ( - "L2", - r#"PREFIX wsdbm: + ), + // L2 + ( + "L2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -279,11 +281,11 @@ fn workload_queries() -> Vec { ?v2 sorg:nationality ?v1 . 
} "#, - ), - // L3 - ( - "L3", - r#"PREFIX wsdbm: + ), + // L3 + ( + "L3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -299,11 +301,11 @@ fn workload_queries() -> Vec { ?v0 wsdbm:subscribes . } "#, - ), - // L4 - ( - "L4", - r#"PREFIX wsdbm: + ), + // L4 + ( + "L4", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -319,11 +321,11 @@ fn workload_queries() -> Vec { ?v0 ?v2 . } "#, - ), - // L5 - ( - "L5", - r#"PREFIX wsdbm: + ), + // L5 + ( + "L5", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -340,11 +342,11 @@ fn workload_queries() -> Vec { ?v0 sorg:nationality ?v3 . } "#, - ), - // S1 - ( - "S1", - r#"PREFIX wsdbm: + ), + // S1 + ( + "S1", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -367,11 +369,11 @@ fn workload_queries() -> Vec { ?v0 sorg:priceValidUntil ?v9 . } "#, - ), - // S2 - ( - "S2", - r#"PREFIX wsdbm: + ), + // S2 + ( + "S2", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -389,11 +391,11 @@ fn workload_queries() -> Vec { ?v0 rdf:type wsdbm:Role2 . } "#, - ), - // S3 - ( - "S3", - r#"PREFIX wsdbm: + ), + // S3 + ( + "S3", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -411,11 +413,11 @@ fn workload_queries() -> Vec { ?v0 sorg:publisher ?v4 . } "#, - ), - // S4 - ( - "S4", - r#"PREFIX wsdbm: + ), + // S4 + ( + "S4", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -433,11 +435,11 @@ fn workload_queries() -> Vec { ?v0 sorg:nationality wsdbm:Country1 . } "#, - ), - // S5 - ( - "S5", - r#"PREFIX wsdbm: + ), + // S5 + ( + "S5", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -455,11 +457,11 @@ fn workload_queries() -> Vec { ?v0 sorg:language wsdbm:Language0 . } "#, - ), - // S6 - ( - "S6", - r#"PREFIX wsdbm: + ), + // S6 + ( + "S6", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -476,11 +478,11 @@ fn workload_queries() -> Vec { ?v0 wsdbm:hasGenre . 
} "#, - ), - // S7 - ( - "S7", - r#"PREFIX wsdbm: + ), + // S7 + ( + "S7", + r#"PREFIX wsdbm: PREFIX sorg: PREFIX dc: PREFIX foaf: @@ -497,15 +499,12 @@ fn workload_queries() -> Vec { wsdbm:likes ?v0 . } "#, - ), + ), ] } fn queries_for_index_manager(workload: &[QuerySpec]) -> Vec { - workload - .iter() - .map(|(_, q)| q.trim().to_string()) - .collect() + workload.iter().map(|(_, q)| q.trim().to_string()).collect() } fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { @@ -526,6 +525,7 @@ fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { eval_interval: 1000, queries, }, + "partial_hexastore" => IndexConfig::PartialHexastore { queries }, "buckets" => IndexConfig::Buckets { queries }, other => { eprintln!( @@ -605,15 +605,86 @@ fn parse_large_ntriples_file( Ok(db) } +/// Helper function to serialize result sets into deterministic, sorted text format +fn serialize_results(results: &[Vec]) -> Vec { + let mut lines = Vec::with_capacity(results.len()); + for row in results { + // Filter out empty rows just in case the engine returns an unpopulated tuple + if row.iter().all(|s| s.is_empty()) { + continue; + } + lines.push(row.join("|")); + } + + lines.sort_unstable(); + lines +} + fn run_all_queries(db: &mut SparqlDatabase, workload: &[QuerySpec]) { const ITERATIONS: usize = 10; + let dir_path = Path::new("../benchmark_dataset"); for (name, query) in workload.iter() { println!("=============================================="); - println!("Running query {} ({} iterations)...", name, ITERATIONS); + println!("Running query {}...", name); - let mut total_time = 0.0; + // Run one validation loop to cache/verify results + if *name != "C3" { + let initial_run_start = Instant::now(); + let validation_results = execute_query_rayon_parallel2_volcano(query, db); + println!( + "Validation run completed in {:.4} seconds", + initial_run_start.elapsed().as_secs_f64() + ); + let ground_truth_file = dir_path.join(format!("ground_truth_{}.txt", name)); + let 
serialized_current = serialize_results(&validation_results); + + if ground_truth_file.exists() { + println!( + "[VALIDATION] Checking results against ground truth: {:?}", + ground_truth_file + ); + let file = File::open(&ground_truth_file).unwrap(); + let reader = BufReader::new(file); + + let mut cached_lines = Vec::new(); + for line in reader.lines() { + if let Ok(l) = line { + if !l.trim().is_empty() { + cached_lines.push(l); + } + } + } + + let current_set: HashSet<_> = serialized_current.into_iter().collect(); + let cached_set: HashSet<_> = cached_lines.into_iter().collect(); + + if current_set != cached_set { + let missing: Vec<_> = cached_set.difference(&current_set).collect(); + let extra: Vec<_> = current_set.difference(&cached_set).collect(); + panic!( + "[FATAL] Query '{}' produced INVALID results!\nMissing {} lines.\nExtra {} lines.\nFirst few missing: {:?}\nFirst few extra: {:?}", + name, missing.len(), extra.len(), missing.iter().take(5).collect::<Vec<_>>(), extra.iter().take(5).collect::<Vec<_>>() + ); + } + println!("[✓] Validation passed for {}!", name); + } else { + println!( + "[VALIDATION] Ground truth does not exist. Caching results to {:?}", + ground_truth_file + ); + let mut file = File::create(&ground_truth_file).unwrap(); + for line in &serialized_current { + writeln!(file, "{}", line).unwrap(); + } + println!("Results cached. Note: Make sure the first run uses the 'hexastore' INDEX_TYPE!"); + } + } + + // Run the timed benchmark loop + println!("Running {} timed iterations...", ITERATIONS); + let mut total_time = 0.0; for _ in 0..ITERATIONS { let start = Instant::now(); let _ = execute_query_rayon_parallel2_volcano(query, db); @@ -639,9 +710,7 @@ fn main() { } Err(e) => { eprintln!("Error processing file '{}': {}", file_path, e); - println!( - "Make sure ../benchmark_dataset/watdiv.10M.nt exists." 
- ); + println!("Make sure ../benchmark_dataset/watdiv.10M.nt exists."); } } } diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index 76b6894..334779d 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -16,8 +16,10 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" INDEX_TYPES=( + "buckets" "buckets" "pso" + "partial_hexastore" "hexastore" "dynamic_hexastore" "ops" @@ -63,4 +65,4 @@ done echo "==============================================" echo " All benchmarks complete!" echo " Results in: ${RESULT_DIR}" -echo "==============================================" +echo "==============================================" \ No newline at end of file diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index 8f26c74..1027e35 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -30,7 +30,7 @@ use shared::index_manager::TripleIndex; use shared::index_manager::{ BucketIndex, DynamicHexastoreIndex, HexastoreIndex, IndexConfig, OPSSingleIndex, OSPSingleIndex, POSSingleIndex, PSOSingleIndex, SOPSingleIndex, SPOSingleIndex, - SingleTableIndex, + SingleTableIndex, PartialHexastoreIndex }; use shared::query::FilterExpression; use shared::terms::TriplePattern; @@ -105,6 +105,7 @@ impl SparqlDatabase { // `build_all_indexes` will swap them out. IndexConfig::DynamicHexastore { .. } => Box::new(HexastoreIndex::new()), IndexConfig::Buckets { .. } => Box::new(HexastoreIndex::new()), + IndexConfig::PartialHexastore { .. 
} => Box::new(HexastoreIndex::new()), } } @@ -155,20 +156,12 @@ impl SparqlDatabase { values_clause.as_ref(), ); - //println!("\n[Plan Debug] === UNOPTIMIZED LOGICAL PLAN ==="); - //println!("{:#?}", logical_plan); - - // Fetch database stats & use Streamertail optimizer to find the exact physical execution plan let stats = self.get_or_build_stats(); let mut optimizer = Streamertail::with_cached_stats(stats.clone()); let optimized_plan = optimizer.find_best_plan(&logical_plan); - //println!("\n[Plan Debug] === OPTIMIZED PHYSICAL PLAN ==="); - //println!("{:#?}\n", optimized_plan); - let mut bound_vars = HashSet::new(); - // Values_clause bindings are available from the very beginning if let Some(vc) = values_clause { for var in &vc.variables { let mut v = var.to_string(); @@ -179,7 +172,6 @@ impl SparqlDatabase { } } - // Helper recursive function to walk the PHYSICAL plan execution tree fn traverse_physical( op: &PhysicalOperator, bound_vars: &mut HashSet, @@ -210,7 +202,6 @@ impl SparqlDatabase { bound_object, }); - // Variables from this scan are now bound for downstream pipeline operations if let Term::Variable(v) = s { bound_vars.insert(v.clone()); } @@ -227,8 +218,6 @@ impl SparqlDatabase { } => { let mut sorted_patterns = patterns.clone(); - // Crucial Fix: StarJoin engine executes patterns by selectivity (most constants first) - // We must sort them exactly how the engine executes them to track variables accurately. 
sorted_patterns.sort_by_key(|p| { let mut constants = 0; if matches!(p.0, Term::Constant(_)) { @@ -243,29 +232,61 @@ impl SparqlDatabase { std::cmp::Reverse(constants) }); - for pattern in &sorted_patterns { + let initial_bound_vars = bound_vars.clone(); + + for (i, pattern) in sorted_patterns.iter().enumerate() { let (s, p, o) = pattern; - let bound_subject = match s { + + // The pattern evaluated independently (Hash Join path fallback) + let original_bound_subject = match s { Term::Constant(_) => true, - Term::Variable(v) => bound_vars.contains(v), + Term::Variable(v) => initial_bound_vars.contains(v), }; - let bound_predicate = match p { + let original_bound_predicate = match p { Term::Constant(_) => true, - Term::Variable(v) => bound_vars.contains(v), + Term::Variable(v) => initial_bound_vars.contains(v), }; - let bound_object = match o { + let original_bound_object = match o { Term::Constant(_) => true, - Term::Variable(v) => bound_vars.contains(v), + Term::Variable(v) => initial_bound_vars.contains(v), }; out.push(PlannedAccessPattern { pattern: pattern.clone(), - bound_subject, - bound_predicate, - bound_object, + bound_subject: original_bound_subject, + bound_predicate: original_bound_predicate, + bound_object: original_bound_object, }); - // Post-scan, its variables are bound for the subsequent internal StarJoin patterns + // For i > 0, it might also be executed as a Bind Join + if i > 0 { + let accum_bound_subject = match s { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + let accum_bound_predicate = match p { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + let accum_bound_object = match o { + Term::Constant(_) => true, + Term::Variable(v) => bound_vars.contains(v), + }; + + if accum_bound_subject != original_bound_subject + || accum_bound_predicate != original_bound_predicate + || accum_bound_object != original_bound_object + { + out.push(PlannedAccessPattern { + pattern: 
pattern.clone(), + bound_subject: accum_bound_subject, + bound_predicate: accum_bound_predicate, + bound_object: accum_bound_object, + }); + } + } + + // Update accumulated bound_vars if let Term::Variable(v) = s { bound_vars.insert(v.clone()); } @@ -277,21 +298,30 @@ impl SparqlDatabase { } } } - PhysicalOperator::NestedLoopJoin { left, right } - | PhysicalOperator::ParallelJoin { left, right } => { - // Pipeline joins: Left executes first, bindings flow completely into right side - traverse_physical(left, bound_vars, out); - traverse_physical(right, bound_vars, out); + PhysicalOperator::ParallelJoin { left, right } => { + // Left executes independently + let mut left_vars = bound_vars.clone(); + traverse_physical(left, &mut left_vars, out); + + // Right can execute independently (Hash/Merge join) OR dependently (Bind join) + let mut right_vars_unbound = bound_vars.clone(); + traverse_physical(right, &mut right_vars_unbound, out); + + let mut right_vars_bound = left_vars.clone(); + traverse_physical(right, &mut right_vars_bound, out); + + bound_vars.extend(left_vars); + bound_vars.extend(right_vars_unbound); } - PhysicalOperator::HashJoin { left, right } + PhysicalOperator::NestedLoopJoin { left, right } + | PhysicalOperator::HashJoin { left, right } | PhysicalOperator::OptimizedHashJoin { left, right } => { - // Hash joins: Both sides evaluate independently using ONLY the pre-join bounds + // Both sides evaluate independently using ONLY the pre-join bounds let mut left_vars = bound_vars.clone(); let mut right_vars = bound_vars.clone(); traverse_physical(left, &mut left_vars, out); traverse_physical(right, &mut right_vars, out); - // After execution, the result contains variables from both sides bound_vars.extend(left_vars); bound_vars.extend(right_vars); } @@ -397,7 +427,14 @@ impl SparqlDatabase { // Now it's perfectly fine to borrow `self` mutably! 
let patterns = self.resolve_planned_access_patterns(&queries); Box::new(BucketIndex::new(patterns)) - } // Future index types go here: + } + + IndexConfig::PartialHexastore { queries } => { + let parsed_patterns = self.resolve_planned_access_patterns(&queries); + Box::new(PartialHexastoreIndex::new(parsed_patterns)) + } + + // Future index types go here: // IndexConfig::YourNewIndex { some_param, queries } => { // let patterns = self.resolve_query_patterns(&queries); // Box::new(YourNewIndex::new(patterns, some_param)) diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs index 3985db8..1489ede 100644 --- a/shared/src/index_manager/mod.rs +++ b/shared/src/index_manager/mod.rs @@ -23,6 +23,8 @@ pub use spo_single::SPOSingleIndex; pub use single_table::SingleTableIndex; pub use dynamic_hexastore::DynamicHexastoreIndex; pub use buckets::BucketIndex; +pub use partial_hexastore::PartialHexastoreIndex; +pub mod partial_hexastore; pub mod hexastore; pub mod ops_single; pub mod osp_single; @@ -66,6 +68,10 @@ pub enum IndexConfig { /// Buckets Buckets { queries: Vec, + }, + + PartialHexastore { + queries: Vec, } // ── Future index types go here ── diff --git a/shared/src/index_manager/partial_hexastore.rs b/shared/src/index_manager/partial_hexastore.rs new file mode 100644 index 0000000..a95b2ce --- /dev/null +++ b/shared/src/index_manager/partial_hexastore.rs @@ -0,0 +1,325 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; +use crate::index_manager::dynamic_hexastore::{IndexType, CardinalitySnapshot}; +use crate::query::PlannedAccessPattern; + +#[derive(Debug, Clone)] +pub struct PartialHexastoreIndex { + pub spo: Option>>>, + pub pos: Option>>>, + pub osp: Option>>>, + pub pso: Option>>>, + pub ops: Option>>>, + pub sop: Option>>>, + + pub latest_card: CardinalitySnapshot, +} + +impl PartialHexastoreIndex { + pub fn new(patterns: Vec) 
-> Self { + let required_indexes = Self::determine_smallest_index_set(&patterns); + + let mut created_names = Vec::new(); + if required_indexes.contains(&IndexType::SPO) { created_names.push("SPO"); } + if required_indexes.contains(&IndexType::POS) { created_names.push("POS"); } + if required_indexes.contains(&IndexType::OSP) { created_names.push("OSP"); } + if required_indexes.contains(&IndexType::PSO) { created_names.push("PSO"); } + if required_indexes.contains(&IndexType::OPS) { created_names.push("OPS"); } + if required_indexes.contains(&IndexType::SOP) { created_names.push("SOP"); } + + println!("PartialHexastoreIndex initialized with indexes: {:?}", created_names); + + Self { + spo: if required_indexes.contains(&IndexType::SPO) { Some(HashMap::new()) } else { None }, + pos: if required_indexes.contains(&IndexType::POS) { Some(HashMap::new()) } else { None }, + osp: if required_indexes.contains(&IndexType::OSP) { Some(HashMap::new()) } else { None }, + pso: if required_indexes.contains(&IndexType::PSO) { Some(HashMap::new()) } else { None }, + ops: if required_indexes.contains(&IndexType::OPS) { Some(HashMap::new()) } else { None }, + sop: if required_indexes.contains(&IndexType::SOP) { Some(HashMap::new()) } else { None }, + latest_card: CardinalitySnapshot::from_stats(0, 1, 1, 1, 1), + } + } + + pub fn update_cardinalities(&mut self, card: CardinalitySnapshot) { + self.latest_card = card; + } + + /// Finds the absolute smallest set of indexes that covers all physical access patterns efficiently. 
+ fn determine_smallest_index_set(patterns: &[PlannedAccessPattern]) -> HashSet { + if patterns.is_empty() { + return HashSet::from([IndexType::SPO]); + } + + let all_types = [ + IndexType::SPO, IndexType::SOP, IndexType::PSO, + IndexType::POS, IndexType::OSP, IndexType::OPS + ]; + + let mut valid_types_per_pattern = Vec::new(); + for planned in patterns { + let (s, p, o) = &planned.pattern; + + // A variable is considered bound if it's a true constant OR if it's pipeline-bound (from previous steps) + let bound_s = matches!(s, Term::Constant(_)) || planned.bound_subject; + let bound_p = matches!(p, Term::Constant(_)) || planned.bound_predicate; + let bound_o = matches!(o, Term::Constant(_)) || planned.bound_object; + + let mut valid = Vec::new(); + match (bound_s, bound_p, bound_o) { + (true, true, _) => { valid.push(IndexType::SPO); valid.push(IndexType::PSO); } + (true, _, true) => { valid.push(IndexType::SOP); valid.push(IndexType::OSP); } + (_, true, true) => { valid.push(IndexType::POS); valid.push(IndexType::OPS); } + (true, false, false) => { valid.push(IndexType::SPO); valid.push(IndexType::SOP); } + (false, true, false) => { valid.push(IndexType::PSO); valid.push(IndexType::POS); } + (false, false, true) => { valid.push(IndexType::OSP); valid.push(IndexType::OPS); } + (false, false, false) | (true, true, true) => { + valid.extend_from_slice(&all_types); + } + } + valid_types_per_pattern.push(valid); + } + + let mut min_size = usize::MAX; + let mut best_set = HashSet::new(); + let n = all_types.len(); + + for mask in 1..=(1 << n) - 1 { + let mut candidate_set = HashSet::new(); + for (i, &t) in all_types.iter().enumerate() { + if mask & (1 << i) != 0 { + candidate_set.insert(t); + } + } + + let covers_all = valid_types_per_pattern.iter().all(|valid| { + valid.iter().any(|vt| candidate_set.contains(vt)) + }); + + if covers_all && candidate_set.len() < min_size { + min_size = candidate_set.len(); + best_set = candidate_set.clone(); + } + } + + if 
best_set.is_empty() { + best_set.insert(IndexType::SPO); + } + + best_set + } + + /// Selects the best index on-query based on bound variables and root cardinality (lean index tiebreaker). + fn select_best_index(&self, s: Option, p: Option, o: Option) -> IndexType { + let mut candidates = Vec::new(); + + let add_if_available = |candidates: &mut Vec<(IndexType, f64)>, idx: IndexType, available: bool, root_card: f64| { + if available { + candidates.push((idx, root_card)); + } + }; + + // Rule 1: Bound variables first. + match (s.is_some(), p.is_some(), o.is_some()) { + (true, true, _) => { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + } + (true, _, true) => { + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + } + (_, true, true) => { + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + (true, false, false) => { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + } + (false, true, false) => { + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + } + (false, false, true) => { + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + 
} + (false, false, false) | (true, true, true) => { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + } + + // Fallback if none of the optimal indexes for this query shape were instantiated + if candidates.is_empty() { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + + // Rule 2: Tiebreaker - lean index is better (smaller root cardinality) + candidates.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + candidates[0].0 + } + + #[inline] + fn query_index( + index: &HashMap>>, + q_root: Option, q_mid: Option, q_leaf: Option, + build_triple: impl Fn(u32, u32, u32) -> Triple + ) -> Vec { + let mut results = Vec::new(); + let mut scan_mid = |root_val: u32, mid_map: &HashMap>| { + if let Some(mv) = q_mid { + if let Some(leaf_set) = mid_map.get(&mv) { + if let Some(lv) = q_leaf { + if 
leaf_set.contains(&lv) { results.push(build_triple(root_val, mv, lv)); } + } else { + for &lv in leaf_set { results.push(build_triple(root_val, mv, lv)); } + } + } + } else { + for (&mv, leaf_set) in mid_map { + if let Some(lv) = q_leaf { + if leaf_set.contains(&lv) { results.push(build_triple(root_val, mv, lv)); } + } else { + for &lv in leaf_set { results.push(build_triple(root_val, mv, lv)); } + } + } + } + }; + + if let Some(rv) = q_root { + if let Some(mid_map) = index.get(&rv) { scan_mid(rv, mid_map); } + } else { + for (&rv, mid_map) in index { scan_mid(rv, mid_map); } + } + results + } +} + +impl TripleIndex for PartialHexastoreIndex { + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + let mut inserted = false; + if let Some(ref mut idx) = self.spo { inserted |= idx.entry(s).or_default().entry(p).or_default().insert(o); } + if let Some(ref mut idx) = self.pos { inserted |= idx.entry(p).or_default().entry(o).or_default().insert(s); } + if let Some(ref mut idx) = self.osp { inserted |= idx.entry(o).or_default().entry(s).or_default().insert(p); } + if let Some(ref mut idx) = self.pso { inserted |= idx.entry(p).or_default().entry(s).or_default().insert(o); } + if let Some(ref mut idx) = self.ops { inserted |= idx.entry(o).or_default().entry(p).or_default().insert(s); } + if let Some(ref mut idx) = self.sop { inserted |= idx.entry(s).or_default().entry(o).or_default().insert(p); } + inserted + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + let mut deleted = false; + + let check_and_delete = |idx: &mut Option>>>, r, m, l| { + if let Some(map) = idx { + remove_from_index(map, r, m, l); + return true; + } + false + }; + + if let Some(ref mut idx) = self.spo { + let exists = idx.get(&s).and_then(|pm| pm.get(&p)).map_or(false, |os| os.contains(&o)); + if !exists { return false; } + } + + deleted |= check_and_delete(&mut self.spo, 
s, p, o); + check_and_delete(&mut self.pos, p, o, s); + check_and_delete(&mut self.osp, o, s, p); + check_and_delete(&mut self.pso, p, s, o); + check_and_delete(&mut self.ops, o, p, s); + check_and_delete(&mut self.sop, s, o, p); + + deleted + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let best_index = self.select_best_index(s, p, o); + match best_index { + IndexType::SPO => Self::query_index(self.spo.as_ref().unwrap(), s, p, o, |s, p, o| Triple { subject: s, predicate: p, object: o }), + IndexType::SOP => Self::query_index(self.sop.as_ref().unwrap(), s, o, p, |s, o, p| Triple { subject: s, predicate: p, object: o }), + IndexType::PSO => Self::query_index(self.pso.as_ref().unwrap(), p, s, o, |p, s, o| Triple { subject: s, predicate: p, object: o }), + IndexType::POS => Self::query_index(self.pos.as_ref().unwrap(), p, o, s, |p, o, s| Triple { subject: s, predicate: p, object: o }), + IndexType::OSP => Self::query_index(self.osp.as_ref().unwrap(), o, s, p, |o, s, p| Triple { subject: s, predicate: p, object: o }), + IndexType::OPS => Self::query_index(self.ops.as_ref().unwrap(), o, p, s, |o, p, s| Triple { subject: s, predicate: p, object: o }), + } + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Term::Constant(x) => Some(*x), Term::Variable(_) => None }; + let pre = match p { Term::Constant(x) => Some(*x), Term::Variable(_) => None }; + let obj = match o { Term::Constant(x) => Some(*x), Term::Variable(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + if let Some(idx) = &mut self.spo { idx.clear(); } + if let Some(idx) = &mut self.pos { idx.clear(); } + if let Some(idx) = &mut self.osp { idx.clear(); } + if let Some(idx) = &mut self.pso { idx.clear(); } + if let Some(idx) = &mut self.ops { idx.clear(); } + if let Some(idx) = &mut self.sop { idx.clear(); } + } + + fn clone_empty(&self) -> Box { + Box::new(Self { + spo: self.spo.as_ref().map(|_| 
HashMap::new()), + pos: self.pos.as_ref().map(|_| HashMap::new()), + osp: self.osp.as_ref().map(|_| HashMap::new()), + pso: self.pso.as_ref().map(|_| HashMap::new()), + ops: self.ops.as_ref().map(|_| HashMap::new()), + sop: self.sop.as_ref().map(|_| HashMap::new()), + latest_card: self.latest_card.clone(), + }) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: self.spo.is_some() || self.pso.is_some(), + so: self.sop.is_some() || self.osp.is_some(), + po: self.pos.is_some() || self.ops.is_some(), + ps: self.pso.is_some() || self.spo.is_some(), + os: self.osp.is_some() || self.sop.is_some(), + op: self.ops.is_some() || self.pos.is_some(), + } + } + + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + self.spo.as_ref().and_then(|idx| idx.get(&s).and_then(|m| m.get(&p))) + } + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + self.sop.as_ref().and_then(|idx| idx.get(&s).and_then(|m| m.get(&o))) + } + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + self.pos.as_ref().and_then(|idx| idx.get(&p).and_then(|m| m.get(&o))) + } + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + self.pso.as_ref().and_then(|idx| idx.get(&p).and_then(|m| m.get(&s))) + } + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + self.osp.as_ref().and_then(|idx| idx.get(&o).and_then(|m| m.get(&s))) + } + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { + self.ops.as_ref().and_then(|idx| idx.get(&o).and_then(|m| m.get(&p))) + } +} \ No newline at end of file From 3e6258048aefe4cf4b5a5f32418a7f41fe306ef0 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Sun, 19 Apr 2026 04:07:32 +0200 Subject: [PATCH 18/23] Add synthetic data stream benchmark --- kolibrie/Cargo.toml | 4 + .../generate_synthetic_stream.py | 64 +++++++++ .../synthetic_stream_benchmark.rs | 121 ++++++++++++++++++ .../synthetic_stream_benchmark.sh | 
45 +++++++ kolibrie/src/sparql_database.rs | 10 ++ shared/src/index_manager/buckets.rs | 18 +-- 6 files changed, 253 insertions(+), 9 deletions(-) create mode 100644 kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py create mode 100644 kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs create mode 100644 kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh diff --git a/kolibrie/Cargo.toml b/kolibrie/Cargo.toml index 159f917..7793119 100644 --- a/kolibrie/Cargo.toml +++ b/kolibrie/Cargo.toml @@ -288,3 +288,7 @@ path = "examples/real_scenario/mqtt_example.rs" [[example]] name = "mqtt_real_scenario" path = "examples/real_scenario/mqtt_real_scenario.rs" + +[[example]] +name = "synthetic_stream_benchmark" +path = "examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs" diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py new file mode 100644 index 0000000..c22fd97 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py @@ -0,0 +1,64 @@ +import random +import json +import argparse + +def generate_synthetic_data(num_triples, num_subjects, num_predicates, num_objects, window_size, slide_size, output_nt, output_queries): + subjects = [f"" for i in range(num_subjects)] + predicates = [f"" for i in range(num_predicates)] + objects = [f"" for i in range(num_objects)] + + # 1. Generate the triple stream + with open(output_nt, 'w') as f: + for _ in range(num_triples): + s = random.choice(subjects) + p = random.choice(predicates) + o = random.choice(objects) + f.write(f"{s} {p} {o} .\n") + + def get_distinct(pool, k): + return random.sample(pool, k) + + # 2. 
Pick specific constants for queries to ensure they match generated data + p_vars = get_distinct(predicates, 8) + o_vars = get_distinct(objects, 4) + s_vars = get_distinct(subjects, 3) + p_q6 = get_distinct(predicates, 20) + + # 3. Formulate standard SPARQL SELECT queries + # NOTE: we are not using RSPQL window functions as for some reason these are insanely slow, obscuring any performance difference due to indexing strategy + # Q6 helper: matches a subject with 20 distinct properties + q6_where = " ".join([f"?s {p_q6[i]} ?o{i} ." for i in range(20)]) + q6_select = " ".join([f"?o{i}" for i in range(20)]) + + queries = { + "Q1": f"SELECT ?s WHERE {{ ?s {p_vars[0]} {o_vars[0]} . }}", + "Q2": f"SELECT ?s ?o2 ?o3 WHERE {{ ?s {p_vars[1]} ?o2 . ?s {p_vars[2]} ?o3 . }}", + "Q3": f"SELECT * WHERE {{ {s_vars[0]} {p_vars[3]} {o_vars[1]} . }}", + "Q4": f"SELECT ?v1 ?v5 WHERE {{ ?v1 {p_vars[4]} ?v2 . ?v2 {p_vars[5]} ?v3 . ?v4 {p_vars[6]} ?v3 . ?v4 {p_vars[7]} ?v5 . }}", + "Q5": f"SELECT ?p WHERE {{ {s_vars[1]} ?p {o_vars[2]} . {s_vars[2]} ?p {o_vars[3]} . 
}}", + "Q6": f"SELECT ?s {q6_select} WHERE {{ {q6_where} }}" + } + + with open(output_queries, 'w') as f: + json.dump(queries, f, indent=4) + + print(f"Generated {num_triples} triples in {output_nt}") + print(f"Generated {len(queries)} SPARQL queries in {output_queries}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--triples", type=int, default=100_000) + parser.add_argument("--subjects", type=int, default=5_000) + parser.add_argument("--predicates", type=int, default=50) + parser.add_argument("--objects", type=int, default=30_000) + parser.add_argument("--window_size", type=int, default=5000) + parser.add_argument("--slide_size", type=int, default=1000) + parser.add_argument("--output_nt", type=str, default="benchmark_dataset/synthetic_1M.nt") + parser.add_argument("--output_queries", type=str, default="benchmark_dataset/synthetic_queries.json") + args = parser.parse_args() + + generate_synthetic_data( + args.triples, args.subjects, args.predicates, args.objects, + args.window_size, args.slide_size, + args.output_nt, args.output_queries + ) \ No newline at end of file diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs new file mode 100644 index 0000000..ed85017 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs @@ -0,0 +1,121 @@ +use kolibrie::execute_query::*; +use kolibrie::sparql_database::*; +use shared::index_manager::*; +use std::collections::HashMap; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::time::Instant; + +fn load_queries(path: &str) -> HashMap { + let file = File::open(path).expect("Failed to open queries JSON file"); + let reader = BufReader::new(file); + serde_json::from_reader(reader).expect("Failed to parse JSON queries") +} + +fn make_config_from_env(queries: Vec) -> (String, 
IndexConfig) { + let index_type = std::env::var("INDEX_TYPE") + .unwrap_or_else(|_| "hexastore".to_string()) + .to_lowercase(); + + let config = match index_type.as_str() { + "hexastore" | "" => IndexConfig::Hexastore, + "spo" => IndexConfig::SPO, + "pos" => IndexConfig::POS, + "table" => IndexConfig::SingleTable, + "partial_hexastore" => IndexConfig::PartialHexastore { queries }, + "buckets" => IndexConfig::Buckets { queries }, + other => { + eprintln!("Unknown INDEX_TYPE '{}', falling back to hexastore.", other); + IndexConfig::Hexastore + } + }; + + (index_type, config) +} + +fn main() { + let window_size_str = std::env::var("WINDOW_SIZE").unwrap_or_else(|_| "50000".to_string()); + let window_size: usize = window_size_str.parse().unwrap(); + let slide_size_str = std::env::var("SLIDE_SIZE").unwrap_or_else(|_| "10000".to_string()); + let slide_size: usize = slide_size_str.parse().unwrap(); + + // 1. Load queries generated by Python + let query_map = load_queries("./benchmark_dataset/synthetic_queries.json"); + let query_strings: Vec = query_map.values().cloned().collect(); + let (index_name, config) = make_config_from_env(query_strings); + + println!("INDEX_TYPE = {}", index_name); + println!("WINDOW_SIZE = {}", window_size); + println!("SLIDE_SIZE = {}", slide_size); + + let mut db = SparqlDatabase::with_config(config); + let file = File::open("./benchmark_dataset/synthetic_1M.nt").expect("Run Python script first"); + let reader = BufReader::new(file); + + let mut all_triples = Vec::new(); + for line in reader.lines() { + if let Ok(l) = line { + if !l.trim().is_empty() { + all_triples.push(l); + } + } + } + + println!("Loaded {} triples from disk.", all_triples.len()); + let mut current_window = Vec::new(); + + let mut total_insertion_time = 0.0; + let mut total_deletion_time = 0.0; + let mut total_query_time = 0.0; + let mut window_count = 0; + let mut first = true; + + for chunk in all_triples.chunks(slide_size) { + // --- SLIDE IN --- + let insert_start = 
Instant::now(); + let batch_data = chunk.join("\n"); + db.parse_ntriples_and_add(&batch_data); + + // Indexes have to be built on first window + if first { + db.get_or_build_stats(); + db.build_all_indexes(); + first = false; + } + + current_window.extend_from_slice(chunk); + total_insertion_time += insert_start.elapsed().as_secs_f64(); + + // --- SLIDE OUT (Manual Window Management) --- + if current_window.len() > window_size { + let overflow = current_window.len() - window_size; + let to_remove: Vec = current_window.drain(0..overflow).collect(); + let batch_delete_data = to_remove.join("\n"); + + let delete_start = Instant::now(); + db.parse_ntriples_and_remove(&batch_delete_data); + total_deletion_time += delete_start.elapsed().as_secs_f64(); + } + + // --- EXECUTE QUERIES --- + let query_start = Instant::now(); + for (id, query) in &query_map { + let _results = execute_query_rayon_parallel2_volcano(query, &mut db); + } + total_query_time += query_start.elapsed().as_secs_f64(); + window_count += 1; + + if window_count % 5 == 0 { + println!("Window {} processed. 
Active Triples: {}", window_count, current_window.len()); + } + } + + println!("\n--- Final Benchmark Results ---"); + println!("Total Window Slide Operations: {}", window_count); + println!("Total Insertion Time: {:.4} s", total_insertion_time); + println!("Total Deletion Time: {:.4} s", total_deletion_time); + println!("Total Query Time (All Queries): {:.4} s", total_query_time); + + let total_time = total_insertion_time + total_deletion_time + total_query_time; + println!("Overall Throughput: {:.2} windows/sec", (window_count as f64) / total_time); +} \ No newline at end of file diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh new file mode 100644 index 0000000..c85fd48 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -e + +# Configuration +TRIPLES=1000000 +WINDOW_SIZE=100000 +SLIDE_SIZE=50000 +SUBJECTS=300000 +PREDICATES=100 +OBJECTS=100000 + +echo "Building Kolibrie benchmark..." +cargo build --release --example synthetic_stream_benchmark + +# Setup Data Directory +mkdir -p benchmark_dataset +echo "Generating native Kolibrie synthetic dataset and rules..." 
+python generate_synthetic_stream.py \ + --triples $TRIPLES \ + --subjects $SUBJECTS \ + --predicates $PREDICATES \ + --objects $OBJECTS \ + --window_size $WINDOW_SIZE \ + --slide_size $SLIDE_SIZE \ + --output_nt benchmark_dataset/synthetic_1M.nt \ + --output_queries benchmark_dataset/synthetic_queries.json + +INDEXES=("hexastore" "partial_hexastore" "buckets" "pso" "spo" "pos" "table") + +for IDX in "${INDEXES[@]}"; do + echo "==========================================================" + echo "Running Stream Benchmark for Index Type: $IDX" + echo "Window Size: $WINDOW_SIZE | Slide Size: $SLIDE_SIZE" + echo "==========================================================" + + export INDEX_TYPE=$IDX + export SLIDE_SIZE=$SLIDE_SIZE + export WINDOW_SIZE=$WINDOW_SIZE + + "../../../.././target/release/examples/synthetic_stream_benchmark" + + echo "Finished $IDX" + echo "" +done \ No newline at end of file diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index 1027e35..6c32a5b 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -1043,6 +1043,16 @@ impl SparqlDatabase { } } + // Parse_ntriples and remove from DB function + pub fn parse_ntriples_and_remove(&mut self, ntriples_data: &str) { + let partial_results = self.parse_ntriples(ntriples_data); + + let encoded_triples = self.encode_triples(partial_results); + for encoded_triple in encoded_triples { + self.delete_triple(&encoded_triple); + } + } + // Parses ntriples pub fn parse_ntriples(&mut self, ntriples_data: &str) -> Vec> { let lines: Vec<&str> = ntriples_data.lines().collect(); diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs index 4418c50..0bc0af4 100644 --- a/shared/src/index_manager/buckets.rs +++ b/shared/src/index_manager/buckets.rs @@ -445,20 +445,20 @@ pub struct BucketIndex { impl BucketIndex { pub fn new(patterns: Vec) -> Self { - println!("[Bucket Debug] --- BucketIndex Initialization ---"); - println!( - 
"[Bucket Debug] Requested planned patterns: {}", - patterns.len() - ); + //println!("[Bucket Debug] --- BucketIndex Initialization ---"); + //println!( + // "[Bucket Debug] Requested planned patterns: {}", + // patterns.len() + //); let buckets: Vec = patterns .into_iter() .enumerate() .map(|(i, pat)| { let b = DirectedBucket::new(pat); - println!( - "[Bucket Debug] Bucket [{}]: Pattern: {:?}, C={:?}, D={:?}, F={:?}", - i, b.pattern, b.c_positions, b.d_positions, b.f_positions - ); + //println!( + // "[Bucket Debug] Bucket [{}]: Pattern: {:?}, C={:?}, D={:?}, F={:?}", + // i, b.pattern, b.c_positions, b.d_positions, b.f_positions + //); b }) .collect(); From ca7768b7491cfca4bc81f07d8da7569fdff05a60 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Tue, 21 Apr 2026 00:15:56 +0200 Subject: [PATCH 19/23] split queries synthetic stream benchmark --- .../synthetic_stream_benchmark.rs | 134 ++++++++++-------- .../synthetic_stream_benchmark.sh | 0 2 files changed, 75 insertions(+), 59 deletions(-) mode change 100644 => 100755 kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs index ed85017..9d4d379 100644 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs @@ -21,6 +21,10 @@ fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { "hexastore" | "" => IndexConfig::Hexastore, "spo" => IndexConfig::SPO, "pos" => IndexConfig::POS, + "pso" => IndexConfig::PSO, + "sop" => IndexConfig::SOP, + "ops" => IndexConfig::OPS, + "osp" => IndexConfig::OSP, "table" => IndexConfig::SingleTable, "partial_hexastore" => IndexConfig::PartialHexastore { queries }, "buckets" => IndexConfig::Buckets { queries }, @@ -41,14 +45,12 @@ 
fn main() { // 1. Load queries generated by Python let query_map = load_queries("./benchmark_dataset/synthetic_queries.json"); - let query_strings: Vec = query_map.values().cloned().collect(); - let (index_name, config) = make_config_from_env(query_strings); - - println!("INDEX_TYPE = {}", index_name); - println!("WINDOW_SIZE = {}", window_size); - println!("SLIDE_SIZE = {}", slide_size); + + // Sort queries to run them in a consistent order (Q1, Q2, etc.) + let mut query_keys: Vec<_> = query_map.keys().cloned().collect(); + query_keys.sort(); - let mut db = SparqlDatabase::with_config(config); + // 2. Load all triples into memory once to avoid disk I/O bottlenecks during the benchmark let file = File::open("./benchmark_dataset/synthetic_1M.nt").expect("Run Python script first"); let reader = BufReader::new(file); @@ -60,62 +62,76 @@ fn main() { } } } - println!("Loaded {} triples from disk.", all_triples.len()); - let mut current_window = Vec::new(); - - let mut total_insertion_time = 0.0; - let mut total_deletion_time = 0.0; - let mut total_query_time = 0.0; - let mut window_count = 0; - let mut first = true; - - for chunk in all_triples.chunks(slide_size) { - // --- SLIDE IN --- - let insert_start = Instant::now(); - let batch_data = chunk.join("\n"); - db.parse_ntriples_and_add(&batch_data); + + // 3. 
Iterate over each query individually + for query_id in query_keys { + let query_string = query_map.get(&query_id).unwrap().clone(); - // Indexes have to be built on first window - if first { - db.get_or_build_stats(); - db.build_all_indexes(); - first = false; - } + // Pass ONLY the current query to the config so partial indexes are built specifically for it + let (index_name, config) = make_config_from_env(vec![query_string.clone()]); - current_window.extend_from_slice(chunk); - total_insertion_time += insert_start.elapsed().as_secs_f64(); - - // --- SLIDE OUT (Manual Window Management) --- - if current_window.len() > window_size { - let overflow = current_window.len() - window_size; - let to_remove: Vec = current_window.drain(0..overflow).collect(); - let batch_delete_data = to_remove.join("\n"); - - let delete_start = Instant::now(); - db.parse_ntriples_and_remove(&batch_delete_data); - total_deletion_time += delete_start.elapsed().as_secs_f64(); - } + println!("\n=========================================================="); + println!("INDEX_TYPE = {} | QUERY = {}", index_name, query_id); + println!("WINDOW_SIZE = {} | SLIDE_SIZE = {}", window_size, slide_size); + println!("=========================================================="); + + // Initialize a fresh database for this query run + let mut db = SparqlDatabase::with_config(config); + let mut current_window = Vec::new(); + + let mut total_insertion_time = 0.0; + let mut total_deletion_time = 0.0; + let mut total_query_time = 0.0; + let mut window_count = 0; + let mut first = true; + + for chunk in all_triples.chunks(slide_size) { + // --- SLIDE IN --- + let insert_start = Instant::now(); + let batch_data = chunk.join("\n"); + db.parse_ntriples_and_add(&batch_data); + + // Indexes have to be built on first window + if first { + db.get_or_build_stats(); + db.build_all_indexes(); + first = false; + } + + current_window.extend_from_slice(chunk); + total_insertion_time += insert_start.elapsed().as_secs_f64(); 
+ + // --- SLIDE OUT (Manual Window Management) --- + if current_window.len() > window_size { + let overflow = current_window.len() - window_size; + let to_remove: Vec = current_window.drain(0..overflow).collect(); + let batch_delete_data = to_remove.join("\n"); + + let delete_start = Instant::now(); + db.parse_ntriples_and_remove(&batch_delete_data); + total_deletion_time += delete_start.elapsed().as_secs_f64(); + } - // --- EXECUTE QUERIES --- - let query_start = Instant::now(); - for (id, query) in &query_map { - let _results = execute_query_rayon_parallel2_volcano(query, &mut db); + // --- EXECUTE ONLY CURRENT QUERY --- + let query_start = Instant::now(); + let _results = execute_query_rayon_parallel2_volcano(&query_string, &mut db); + total_query_time += query_start.elapsed().as_secs_f64(); + + window_count += 1; + + if window_count % 5 == 0 { + println!(" Window {} processed. Active Triples: {}", window_count, current_window.len()); + } } - total_query_time += query_start.elapsed().as_secs_f64(); - window_count += 1; + + println!("\n--- Final Benchmark Results: {} on {} ---", query_id, index_name); + println!("Total Window Slide Operations: {}", window_count); + println!("Total Insertion Time: {:.4} s", total_insertion_time); + println!("Total Deletion Time: {:.4} s", total_deletion_time); + println!("Total Query Time: {:.4} s", total_query_time); - if window_count % 5 == 0 { - println!("Window {} processed. 
Active Triples: {}", window_count, current_window.len()); - } + let total_time = total_insertion_time + total_deletion_time + total_query_time; + println!("Overall Throughput: {:.2} windows/sec", (window_count as f64) / total_time); } - - println!("\n--- Final Benchmark Results ---"); - println!("Total Window Slide Operations: {}", window_count); - println!("Total Insertion Time: {:.4} s", total_insertion_time); - println!("Total Deletion Time: {:.4} s", total_deletion_time); - println!("Total Query Time (All Queries): {:.4} s", total_query_time); - - let total_time = total_insertion_time + total_deletion_time + total_query_time; - println!("Overall Throughput: {:.2} windows/sec", (window_count as f64) / total_time); -} \ No newline at end of file +} diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh old mode 100644 new mode 100755 From 20660c535d24cb2883d4815d5f512a7e3f2c67e9 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Tue, 21 Apr 2026 01:23:29 +0200 Subject: [PATCH 20/23] Add more queries to synthetic stream benchmark --- .../generate_synthetic_stream.py | 12 +++--- .../synthetic_stream_benchmark.rs | 39 ++++++++++++++++--- .../synthetic_stream_benchmark.sh | 10 +++-- .../execution/engine.rs | 2 +- shared/src/index_manager/buckets.rs | 18 ++++----- 5 files changed, 57 insertions(+), 24 deletions(-) diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py index c22fd97..ded5940 100644 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py @@ -19,9 +19,9 @@ def get_distinct(pool, k): return random.sample(pool, k) # 2. 
Pick specific constants for queries to ensure they match generated data - p_vars = get_distinct(predicates, 8) - o_vars = get_distinct(objects, 4) - s_vars = get_distinct(subjects, 3) + p_vars = get_distinct(predicates, 9) + o_vars = get_distinct(objects, 5) + s_vars = get_distinct(subjects, 4) p_q6 = get_distinct(predicates, 20) # 3. Formulate standard SPARQL SELECT queries @@ -36,7 +36,9 @@ def get_distinct(pool, k): "Q3": f"SELECT * WHERE {{ {s_vars[0]} {p_vars[3]} {o_vars[1]} . }}", "Q4": f"SELECT ?v1 ?v5 WHERE {{ ?v1 {p_vars[4]} ?v2 . ?v2 {p_vars[5]} ?v3 . ?v4 {p_vars[6]} ?v3 . ?v4 {p_vars[7]} ?v5 . }}", "Q5": f"SELECT ?p WHERE {{ {s_vars[1]} ?p {o_vars[2]} . {s_vars[2]} ?p {o_vars[3]} . }}", - "Q6": f"SELECT ?s {q6_select} WHERE {{ {q6_where} }}" + "Q6": f"SELECT ?s {q6_select} WHERE {{ {q6_where} }}", + "Q7": f"SELECT ?s4 ?p5 ?o6 WHERE {{ {s_vars[3]} ?p1 ?o1 . ?s2 {p_vars[8]} ?o2 . ?s3 ?p3 {o_vars[4]} . ?s4 ?p1 ?o4 . ?s5 ?p5 ?o2 . ?s3 ?p6 ?o6 . ?s4 ?p5 ?o6 }}", + "Q8": f"SELECT * WHERE {{ ?n1 ?e12 ?n2 . ?n2 ?e23 ?n3 . ?n3 ?e34 ?n4 . ?n4 ?e41 ?n1 . ?n1 ?e13 ?n3 . ?n3 ?e31 ?n1 . ?n2 ?e24 ?n4 . ?n4 ?e42 ?n2 . ?n2 ?e21 ?n1 . ?n4 ?e43 ?n3 . 
}}" } with open(output_queries, 'w') as f: @@ -61,4 +63,4 @@ def get_distinct(pool, k): args.triples, args.subjects, args.predicates, args.objects, args.window_size, args.slide_size, args.output_nt, args.output_queries - ) \ No newline at end of file + ) diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs index 9d4d379..7382ea8 100644 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs @@ -2,8 +2,8 @@ use kolibrie::execute_query::*; use kolibrie::sparql_database::*; use shared::index_manager::*; use std::collections::HashMap; -use std::fs::File; -use std::io::{BufRead, BufReader}; +use std::fs::{File, OpenOptions}; +use std::io::{BufRead, BufReader, Write, Seek, SeekFrom}; use std::time::Instant; fn load_queries(path: &str) -> HashMap { @@ -43,6 +43,20 @@ fn main() { let slide_size_str = std::env::var("SLIDE_SIZE").unwrap_or_else(|_| "10000".to_string()); let slide_size: usize = slide_size_str.parse().unwrap(); + // Setup CSV file for output + let csv_path = "./benchmark_dataset/benchmark_results.csv"; + let mut csv_file = OpenOptions::new() + .create(true) + .append(true) + .open(csv_path) + .expect("Failed to open CSV file"); + + // Write CSV header if the file is new/empty + if csv_file.metadata().expect("Failed to get metadata").len() == 0 { + writeln!(csv_file, "Index_Type,Query_ID,Window_Size,Slide_Size,Total_Windows,Insertion_Time_s,Deletion_Time_s,Query_Time_s,Throughput_win_per_s") + .expect("Failed to write CSV header"); + } + // 1. Load queries generated by Python let query_map = load_queries("./benchmark_dataset/synthetic_queries.json"); @@ -66,11 +80,16 @@ fn main() { // 3. 
Iterate over each query individually for query_id in query_keys { + let query_string = query_map.get(&query_id).unwrap().clone(); // Pass ONLY the current query to the config so partial indexes are built specifically for it let (index_name, config) = make_config_from_env(vec![query_string.clone()]); - + + if (query_id == "Q8" && index_name != "hexastore" && index_name != "partial_hexastore" && index_name != "buckets") { + continue; + } + println!("\n=========================================================="); println!("INDEX_TYPE = {} | QUERY = {}", index_name, query_id); println!("WINDOW_SIZE = {} | SLIDE_SIZE = {}", window_size, slide_size); @@ -125,13 +144,21 @@ fn main() { } } + let total_time = total_insertion_time + total_deletion_time + total_query_time; + let throughput = (window_count as f64) / total_time; + println!("\n--- Final Benchmark Results: {} on {} ---", query_id, index_name); println!("Total Window Slide Operations: {}", window_count); println!("Total Insertion Time: {:.4} s", total_insertion_time); println!("Total Deletion Time: {:.4} s", total_deletion_time); println!("Total Query Time: {:.4} s", total_query_time); - - let total_time = total_insertion_time + total_deletion_time + total_query_time; - println!("Overall Throughput: {:.2} windows/sec", (window_count as f64) / total_time); + println!("Overall Throughput: {:.2} windows/sec", throughput); + + // Append to CSV + writeln!( + csv_file, + "{},{},{},{},{},{:.4},{:.4},{:.4},{:.2}", + index_name, query_id, window_size, slide_size, window_count, total_insertion_time, total_deletion_time, total_query_time, throughput + ).expect("Failed to write to CSV file"); } } diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh index c85fd48..d7e448f 100755 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh +++ 
b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!//usr/bin/env bash set -e @@ -15,6 +15,10 @@ cargo build --release --example synthetic_stream_benchmark # Setup Data Directory mkdir -p benchmark_dataset + +# Clear previous CSV results +rm -f benchmark_dataset/benchmark_results.csv + echo "Generating native Kolibrie synthetic dataset and rules..." python generate_synthetic_stream.py \ --triples $TRIPLES \ @@ -26,7 +30,7 @@ python generate_synthetic_stream.py \ --output_nt benchmark_dataset/synthetic_1M.nt \ --output_queries benchmark_dataset/synthetic_queries.json -INDEXES=("hexastore" "partial_hexastore" "buckets" "pso" "spo" "pos" "table") +INDEXES=("buckets" "partial_hexastore" "hexastore" "pso" "spo" "pos" "table") for IDX in "${INDEXES[@]}"; do echo "==========================================================" @@ -42,4 +46,4 @@ for IDX in "${INDEXES[@]}"; do echo "Finished $IDX" echo "" -done \ No newline at end of file +done diff --git a/kolibrie/src/streamertail_optimizer/execution/engine.rs b/kolibrie/src/streamertail_optimizer/execution/engine.rs index fbf54c5..017d624 100644 --- a/kolibrie/src/streamertail_optimizer/execution/engine.rs +++ b/kolibrie/src/streamertail_optimizer/execution/engine.rs @@ -1037,7 +1037,7 @@ impl ExecutionEngine { // FULLY UNBOUND (0 constants, 3 variables) - table scan is appropriate (Term::Variable(s), Term::Variable(p), Term::Variable(o)) => { - println!("INFO: Full table scan for fully unbound pattern (? {}, ?{}, ?{})", s, p, o); + //println!("INFO: Full table scan for fully unbound pattern (? 
{}, ?{}, ?{})", s, p, o); Self::execute_table_scan_with_ids(database, pattern) } } diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs index 0bc0af4..4418c50 100644 --- a/shared/src/index_manager/buckets.rs +++ b/shared/src/index_manager/buckets.rs @@ -445,20 +445,20 @@ pub struct BucketIndex { impl BucketIndex { pub fn new(patterns: Vec) -> Self { - //println!("[Bucket Debug] --- BucketIndex Initialization ---"); - //println!( - // "[Bucket Debug] Requested planned patterns: {}", - // patterns.len() - //); + println!("[Bucket Debug] --- BucketIndex Initialization ---"); + println!( + "[Bucket Debug] Requested planned patterns: {}", + patterns.len() + ); let buckets: Vec = patterns .into_iter() .enumerate() .map(|(i, pat)| { let b = DirectedBucket::new(pat); - //println!( - // "[Bucket Debug] Bucket [{}]: Pattern: {:?}, C={:?}, D={:?}, F={:?}", - // i, b.pattern, b.c_positions, b.d_positions, b.f_positions - //); + println!( + "[Bucket Debug] Bucket [{}]: Pattern: {:?}, C={:?}, D={:?}, F={:?}", + i, b.pattern, b.c_positions, b.d_positions, b.f_positions + ); b }) .collect(); From 689583409adfdfea9b0d473239931dc9d9cf7099 Mon Sep 17 00:00:00 2001 From: Mirovh Date: Fri, 24 Apr 2026 17:32:59 +0200 Subject: [PATCH 21/23] changes --- .../generate_synthetic_stream.py | 39 ++++++++++++- .../synthetic_stream_benchmark.rs | 2 +- .../synthetic_stream_benchmark.sh | 2 +- shared/src/index_manager/buckets.rs | 58 ++++++++++++++----- 4 files changed, 82 insertions(+), 19 deletions(-) diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py index ded5940..4ab6924 100644 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py @@ -30,6 +30,41 @@ def get_distinct(pool, k): q6_where = " 
".join([f"?s {p_q6[i]} ?o{i} ." for i in range(20)]) q6_select = " ".join([f"?o{i}" for i in range(20)]) + # Generate 500 distinct predicates to force BucketIndex to create 500 buckets + p_q9 = get_distinct(predicates, 500) + + q9_where = " ".join([f"?s {p_q9[i]} ?o{i} ." for i in range(500)]) + q9_select = " ".join([f"?o{i}" for i in range(500)]) + + p_q10 = get_distinct(predicates, 500) + s_q10 = get_distinct(subjects, 5) + o_q10 = get_distinct(objects, 5) + + # 2. Build the Q10 WHERE clause explicitly + q10_where_clauses = [ + # --- SAVE TABLE --- + # 1. S _ _ : Highly selective subject bound (~20 matches). Table does 1 scan. + f"{s_q10[0]} ?p_start ?v_core .", + + # 2. _ P O : The 20 matches join against this. Probability of a match is near zero. + # The join collapses immediately. + f"?v_core {p_q10[0]} {o_q10[0]} .", + + # --- KILL PARTIAL HEXASTORE --- + # Force the dynamic indexer to build all remaining permutations statically + f"{s_q10[1]} {p_q10[1]} ?v_core .", # S P _ + f"{s_q10[2]} ?p_mid {o_q10[1]} .", # S _ O + f"?v_core ?p_end {o_q10[2]} .", # _ _ O + f"{s_q10[3]} {p_q10[2]} {o_q10[3]} .", # S P O + ] + + # --- KILL BUCKETS --- + # Add 490+ distinct _ P _ patterns to force massive bucket allocation + for i in range(3, 500): + q10_where_clauses.append(f"?v_core {p_q10[i]} ?v_ext_{i} .") + + q10_where = " ".join(q10_where_clauses) + queries = { "Q1": f"SELECT ?s WHERE {{ ?s {p_vars[0]} {o_vars[0]} . }}", "Q2": f"SELECT ?s ?o2 ?o3 WHERE {{ ?s {p_vars[1]} ?o2 . ?s {p_vars[2]} ?o3 . }}", @@ -38,7 +73,9 @@ def get_distinct(pool, k): "Q5": f"SELECT ?p WHERE {{ {s_vars[1]} ?p {o_vars[2]} . {s_vars[2]} ?p {o_vars[3]} . }}", "Q6": f"SELECT ?s {q6_select} WHERE {{ {q6_where} }}", "Q7": f"SELECT ?s4 ?p5 ?o6 WHERE {{ {s_vars[3]} ?p1 ?o1 . ?s2 {p_vars[8]} ?o2 . ?s3 ?p3 {o_vars[4]} . ?s4 ?p1 ?o4 . ?s5 ?p5 ?o2 . ?s3 ?p6 ?o6 . ?s4 ?p5 ?o6 }}", - "Q8": f"SELECT * WHERE {{ ?n1 ?e12 ?n2 . ?n2 ?e23 ?n3 . ?n3 ?e34 ?n4 . ?n4 ?e41 ?n1 . ?n1 ?e13 ?n3 . ?n3 ?e31 ?n1 . 
?n2 ?e24 ?n4 . ?n4 ?e42 ?n2 . ?n2 ?e21 ?n1 . ?n4 ?e43 ?n3 . }}" + "Q8": f"SELECT * WHERE {{ ?n1 ?e12 ?n2 . ?n2 ?e23 ?n3 . ?n3 ?e34 ?n4 . ?n4 ?e41 ?n1 . ?n1 ?e13 ?n3 . ?n3 ?e31 ?n1 . ?n2 ?e24 ?n4 . ?n4 ?e42 ?n2 . ?n2 ?e21 ?n1 . ?n4 ?e43 ?n3 . }}", + "Q9": f"SELECT ?s {q9_select} WHERE {{ {q9_where} }}", + "Q10": f"SELECT * WHERE {{ {q10_where} }}" } with open(output_queries, 'w') as f: diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs index 7382ea8..4c54e7d 100644 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs @@ -86,7 +86,7 @@ fn main() { // Pass ONLY the current query to the config so partial indexes are built specifically for it let (index_name, config) = make_config_from_env(vec![query_string.clone()]); - if (query_id == "Q8" && index_name != "hexastore" && index_name != "partial_hexastore" && index_name != "buckets") { + if ((query_id == "Q8" || query_id == "Q9") && index_name != "hexastore" && index_name != "partial_hexastore" && index_name != "buckets") { continue; } diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh index d7e448f..d742ff0 100755 --- a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh @@ -7,7 +7,7 @@ TRIPLES=1000000 WINDOW_SIZE=100000 SLIDE_SIZE=50000 SUBJECTS=300000 -PREDICATES=100 +PREDICATES=1000 OBJECTS=100000 echo "Building Kolibrie benchmark..." 
diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs index 4418c50..b93055f 100644 --- a/shared/src/index_manager/buckets.rs +++ b/shared/src/index_manager/buckets.rs @@ -446,25 +446,51 @@ pub struct BucketIndex { impl BucketIndex { pub fn new(patterns: Vec) -> Self { println!("[Bucket Debug] --- BucketIndex Initialization ---"); - println!( - "[Bucket Debug] Requested planned patterns: {}", - patterns.len() - ); - let buckets: Vec = patterns - .into_iter() - .enumerate() - .map(|(i, pat)| { - let b = DirectedBucket::new(pat); + println!("[Bucket Debug] Requested planned patterns: {}", patterns.len()); + + let mut seen_configs = HashSet::new(); + let mut unique_buckets = Vec::new(); + + for planned in patterns { + // 1. Determine positions exactly as DirectedBucket::new would + let mut c_pos_vals = Vec::new(); + let mut d_positions = Vec::new(); + + let mut check = |term: &Term, is_bound: bool, pos: usize| match term { + Term::Constant(c) => c_pos_vals.push((pos, *c)), + Term::Variable(_) => { + if is_bound { + d_positions.push(pos); + } + } + }; + + check(&planned.pattern.0, planned.bound_subject, 0); + check(&planned.pattern.1, planned.bound_predicate, 1); + check(&planned.pattern.2, planned.bound_object, 2); + + // 2. Create a unique key for this storage configuration + // We sort D because the internal logic of DirectedBucket + // usually processes them in positional order. + d_positions.sort_unstable(); + let config_key = (c_pos_vals, d_positions); + + // 3. 
Only create the bucket if we haven't seen this storage config yet + if seen_configs.insert(config_key) { + let b = DirectedBucket::new(planned); println!( - "[Bucket Debug] Bucket [{}]: Pattern: {:?}, C={:?}, D={:?}, F={:?}", - i, b.pattern, b.c_positions, b.d_positions, b.f_positions + "[Bucket Debug] Created Bucket [{}]: C={:?}, D={:?}, F={:?}", + unique_buckets.len(), b.c_positions, b.d_positions, b.f_positions ); - b - }) - .collect(); - Self { buckets } - } + unique_buckets.push(b); + } else { + println!("[Bucket Debug] Pruned duplicate pattern: {:?}", planned.pattern); + } + } + println!("[Bucket Debug] Final unique bucket count: {}", unique_buckets.len()); + Self { buckets: unique_buckets } + } fn bucket_covers_query( bucket_pat: &TriplePattern, q_s: Option, From 4446fc6b3ca1c9000219a43873fce70ce081f4f3 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Sun, 17 May 2026 08:16:58 +0200 Subject: [PATCH 22/23] remove old index from WatDiv helper script --- .../sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh index 334779d..a45c408 100755 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -21,7 +21,6 @@ INDEX_TYPES=( "pso" "partial_hexastore" "hexastore" - "dynamic_hexastore" "ops" "osp" "pos" From 071ed1957bf89ca69c9d883079b8c43fcc24ef14 Mon Sep 17 00:00:00 2001 From: Mirovh <94124714+Mirovh@users.noreply.github.com> Date: Sun, 17 May 2026 08:28:53 +0200 Subject: [PATCH 23/23] patch gitignore --- .gitignore | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index dcf087f..90e0a0a 100644 --- a/.gitignore +++ b/.gitignore @@ -91,9 +91,8 @@ 
kolibrie/src/cuda/cudajoin.lib python/.venv/ # Some other directories -benchmark_dataset/ -kolibrie/examples/sparql_syntax/n_triples_data/benchmark_results/ -!kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh +**/benchmark_results/ +**/benchmark_dataset/ # IntelliJ .idea/