diff --git a/.github/workflows/ci.yml.unused b/.github/workflows/ci.yml.unused new file mode 100644 index 0000000..38d7ec1 --- /dev/null +++ b/.github/workflows/ci.yml.unused @@ -0,0 +1,52 @@ +name: CI + +# This workflow runs tests and builds for each push + +on: + push: + branches: + - main + - master + +jobs: + + all_duplicati_restore: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Update local toolchain + run: | + rustup update + rustup component add clippy + rustup target add x86_64-pc-windows-gnu + sudo apt-get install -y gcc-mingw-w64-x86-64 + + - name: Toolchain info + run: | + cargo --version --verbose + rustc --version + cargo clippy --version + + - name: Lint + run: | + cargo fmt -- --check + cargo clippy -- -D warnings + + - name: Test + run: | + cargo check + cargo test --all + + - name: Build + run: | + cargo build --release + cargo build --release --target x86_64-pc-windows-gnu + + - name: Archive production artifacts + uses: actions/upload-artifact@v3 + with: + name: target binaries + path: | + target/release/rust-duplicati-restore + target/x86_64-pc-windows-gnu/release/rust-duplicati-restore.exe \ No newline at end of file diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..e942013 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,146 @@ +name: Rust + +on: + push: +# pull_request: + +env: + CARGO_TERM_COLOR: always + +defaults: + run: + # necessary for windows + shell: bash + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Cargo cache + uses: actions/cache@v2 + with: + path: | + ~/.cargo/registry + ./target + key: test-cargo-registry + - name: List + run: find ./ + - name: Run tests + run: cargo test --verbose + + build: + strategy: + fail-fast: false + matrix: + # a list of all the targets + include: + - TARGET: x86_64-unknown-linux-gnu # tested in a debian container on a mac + OS: ubuntu-latest + # - TARGET: 
x86_64-unknown-linux-musl # test in an alpine container on a mac + # OS: ubuntu-latest + - TARGET: aarch64-unknown-linux-gnu # tested on aws t4g.nano + OS: ubuntu-latest + # - TARGET: aarch64-unknown-linux-musl # tested on aws t4g.nano in alpine container + # OS: ubuntu-latest + - TARGET: armv7-unknown-linux-gnueabihf # raspberry pi 2-3-4, not tested + OS: ubuntu-latest + # - TARGET: armv7-unknown-linux-musleabihf # raspberry pi 2-3-4, not tested + # OS: ubuntu-latest + - TARGET: arm-unknown-linux-gnueabihf # raspberry pi 0-1, not tested + OS: ubuntu-latest + # - TARGET: arm-unknown-linux-musleabihf # raspberry pi 0-1, not tested + # OS: ubuntu-latest + - TARGET: x86_64-apple-darwin # tested on a mac, is not properly signed so there are security warnings + OS: macos-latest + - TARGET: x86_64-pc-windows-msvc # tested on a windows machine + OS: windows-latest + needs: test + runs-on: ${{ matrix.OS }} + env: + NAME: rust-duplicati-restore + TARGET: ${{ matrix.TARGET }} + OS: ${{ matrix.OS }} + steps: + - uses: actions/checkout@v2 + - name: Cargo cache + uses: actions/cache@v2 + with: + path: | + ~/.cargo/registry + ./target + key: build-cargo-registry-${{matrix.TARGET}} + - name: List + run: find ./ + - name: Install and configure dependencies + run: | + # dependencies are only needed on ubuntu as that's the only place where + # we make cross-compilation + if [[ $OS =~ ^ubuntu.*$ ]]; then + sudo apt-get install -qq crossbuild-essential-arm64 crossbuild-essential-armhf musl musl-tools musl-dev + fi + + # some additional configuration for cross-compilation on linux + cat >>~/.cargo/config <"] -edition = "2018" +version = "0.0.6" +authors = ["Nathan McCarty ", "7ERr0r"] +edition = "2021" + +[profile.dev] +opt-level = 2 + +[features] +dhat-heap = ["dep:dhat"] # if you are doing heap profiling +unqlite = ["dep:unqlite"] # TODO maybe for veeery large backups [dependencies] -zip = "0.5.2" +zip = { version = "*", git = "https://github.com/7ERr0r/zip-duplicati", rev = 
"77f115763e7d1e686273589e7b26f4efd3f5bf38" } chrono = "0.4.0" -base64 = "0.10.1" +base64 = "0.21" pbr = "1.0.1" -serde = {version = "1.0", features = ["derive"]} +serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.39" rayon = "1.0" -num_cpus = "1.10.0" -unqlite = "1.4.1" -indicatif = "0.11.0" \ No newline at end of file +#num_cpus = "1.10.0" +unqlite = { version = "1.5", optional = true } +indicatif = "0.17" +clap = { version = "4.0.32", features = ["derive"] } +eyre = "0.6.8" +serde_path_to_error = "0.1" +smallvec = "*" +sha2 = "0.10.6" +crossbeam-channel = "0.5.6" +dhat = { version = "0.3.2", optional = true } diff --git a/README.md b/README.md index cd9a29c..015147c 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,30 @@ # Rust Duplicati Restore -Rust program for duplicati disaster recovery quick, fast, and in a hurry. -Uses rayon to process files across many threads, to maximze restore speed. +[Fast](https://programming-language-benchmarks.vercel.app/rust-vs-csharp) [Duplicati](https://github.com/duplicati/duplicati) [disaster](https://duplicati.readthedocs.io/en/stable/08-disaster-recovery/) [recovery](https://github.com/duplicati/duplicati/tree/master/Duplicati/CommandLine/RecoveryTool). +Processes files across many threads, to maximze restore speed. -## Getting Started +## Run -Simply run the rust-duplicati-restore from the commandline. -It doesn't accept any flags and will prompt you for all information. - -### Prerequisites - -You must have sqlite3 installed on your system for this program to function. 
- - -### Installing - -Simply run ``` -cargo build --release +Usage: cargo run -- --backup-dir --restore-dir ``` -Or download the latest binary from the artifacts +[More flags here](https://github.com/7ERr0r/duplicati-restore-rs/blob/master/src/flags.rs#L5) + +Or download the latest [binary from releases](https://github.com/7ERr0r/duplicati-restore-rs/releases) ## Limitations -* Currently does not verify restored files -* Does not yet support encrypted backups, I reccomend combining aescrypt with gnu parallel for decryption -* Does not support remote repositories yet, I reccomend using rclone to pull donw a local copy +* Does not yet support [encrypted backups](https://github.com/duplicati/duplicati/issues/2927) - `.aes` files +* Does not support [remote repositories](https://crates.io/crates/remotefs) yet, I recommend using rclone to pull down a local copy ## Built With -* [Rust](https://www.rust-lang.org/) -* [SQLite](https://www.sqlite.org) -* [Rayon](https://github.com/rayon-rs/rayon) -* And may more, see Cargo.toml for full list +* [Rust](https://www.rust-lang.org/) +* [`rayon` crate](https://github.com/rayon-rs/rayon) +* [Modified `zip` crate](https://github.com/7ERr0r/zip-duplicati) +* And many more, see [Cargo.toml](Cargo.toml) for full list ## License @@ -44,3 +34,7 @@ This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md * Ben Fisher - His python script included in the Duplicati reposistory inspired this project, and this project was roughly based on it. + +* Nathan McCarty - Created [Rust-Duplicati-Restore](https://github.com/nmccarty/Rust-Duplicati-Restore) itself + +* 7ERr0r - Optimized ZIP reader. Added sha2 verification. 
diff --git a/src/blockhash.rs b/src/blockhash.rs new file mode 100644 index 0000000..dc78c30 --- /dev/null +++ b/src/blockhash.rs @@ -0,0 +1,72 @@ +use std::{cell::RefCell, fmt::Display}; + +use base64::engine::general_purpose; +use smallvec::SmallVec; + +use crate::hexdisplay::HexDisplayBytes; +thread_local! { + pub static BASE64_DECODE_BUF: RefCell> = RefCell::new(Vec::with_capacity(64)); + +} + +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord)] +pub struct BlockIdHash { + pub hash: SmallVec<[u8; 32]>, +} + +impl Display for BlockIdHash { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", HexDisplayBytes(self.hash.as_slice())) + } +} + +impl BlockIdHash { + pub fn from_bytes(b: &[u8]) -> Option { + if b.len() != 32 { + return None; + } + Some(BlockIdHash { + hash: SmallVec::from_slice(b), + }) + } + + pub fn from_base64(block_id_str: &str) -> Option { + Self::from_base64_config(block_id_str, general_purpose::STANDARD) + } + #[allow(unused)] + pub fn from_base64_urlsafe(block_id_str: &str) -> Option { + Self::from_base64_config(block_id_str, general_purpose::URL_SAFE) + } + + pub fn from_base64_config( + block_id_str: &str, + engine: E, + ) -> Option { + BASE64_DECODE_BUF.with(|b| -> Option { + let buffer: &mut Vec = &mut b.borrow_mut(); + assert!(block_id_str.len() < buffer.capacity()); + engine.decode_vec(block_id_str, buffer).ok()?; + let hash = BlockIdHash { + hash: SmallVec::from_slice(buffer), + }; + buffer.clear(); + Some(hash) + }) + } + + #[allow(unused)] + pub fn as_base64<'a>(&self, buf: &'a mut [u8]) -> &'a str { + self.as_base64_config(general_purpose::STANDARD, buf) + } + pub fn as_base64_urlsafe<'a>(&self, buf: &'a mut [u8]) -> &'a str { + self.as_base64_config(general_purpose::URL_SAFE, buf) + } + pub fn as_base64_config<'a, E: base64::Engine>(&self, engine: E, buf: &'a mut [u8]) -> &'a str { + let encoded_len = engine + .encode_slice(self.hash.as_slice(), &mut buf[..]) + .unwrap(); + 
//debug_assert_eq!(encoded_len, buf.len()); + + std::str::from_utf8(&buf[..encoded_len]).expect("Invalid UTF8") + } +} diff --git a/src/blockid.rs b/src/blockid.rs deleted file mode 100644 index 486905a..0000000 --- a/src/blockid.rs +++ /dev/null @@ -1,166 +0,0 @@ -use crate::database::DB; -use base64; -use serde::Deserialize; -use serde_json; -use std::fs; -use std::fs::File; -use std::io::prelude::*; -use std::io::SeekFrom; -use std::io::Write; -use std::path::Path; - -#[derive(Debug)] -pub enum FileType { - File { - hash: String, - size: i64, - time: String, - }, - Folder { - metablockhash: String, - }, - SymLink, -} - -#[derive(Debug)] -pub struct FileEntry { - path: String, - metahash: String, - metasize: i64, - file_type: FileType, - block_lists: Vec, -} - -impl FileEntry { - pub(self) fn from_ientry(ientry: &IEntry) -> FileEntry { - let path = ientry.path.clone(); - let metahash = ientry.metahash.clone(); - let metasize = ientry.metasize; - let block_lists = if let Some(blocks) = &ientry.blocklists { - blocks.clone() - } else { - Vec::new() - }; - let file_type = match ientry.filetype.as_ref() { - "File" => FileType::File { - hash: ientry.hash.clone().unwrap(), - size: ientry.size.unwrap(), - time: ientry.time.clone().unwrap(), - }, - "Folder" => FileType::Folder { - metablockhash: ientry.metablockhash.clone().unwrap(), - }, - _ => FileType::SymLink, - }; - - FileEntry { - path, - metahash, - metasize, - file_type, - block_lists, - } - } - - pub fn is_file(&self) -> bool { - match self.file_type { - FileType::File { .. } => true, - _ => false, - } - } - - pub fn is_folder(&self) -> bool { - match self.file_type { - FileType::Folder { .. } => true, - _ => false, - } - } - - pub fn restore_file( - &self, - db: &DB, - restore_path: &str - ) { - let root_path = Path::new(restore_path); - let file_path = Path::new(&self.path[1..]); - let path = Path::join(root_path, file_path); - - match &self.file_type { - FileType::Folder { .. 
} => { - fs::create_dir_all(path).unwrap(); - } - FileType::File { hash, size, .. } => { - // Small files only have one block - if self.block_lists.is_empty() { - let mut file = File::create(path.clone()).unwrap(); - let block = db.get_content_block(hash); - if let Some(block) = block { - file.write_all(block.as_ref()).unwrap(); - } else if *size > 0 { - println!("Missing block {} for {}", hash, path.to_str().unwrap()); - } - } else { - let mut file = File::create(path.clone()).unwrap(); - // Each blockid points to a list of blockids - for (blhi, blh) in self.block_lists.iter().enumerate() { - let blockhashoffset = blhi * db.offset_size(); - let binary_hashes = db.get_content_block(blh); - if let Some(binary_hashes) = binary_hashes { - for (bi, hash) in binary_hashes.chunks(db.hash_size()).enumerate() { - let hash = base64::encode(hash); - let block = db.get_content_block(&hash); - - if let Some(block) = block { - file.seek(SeekFrom::Start( - (blockhashoffset + bi * db.block_size()) as u64, - )) - .unwrap(); - file.write_all(&block).unwrap(); - } else { - println!( - "Failed to find block {} for {}", - hash, - path.to_str().unwrap() - ); - } - } - } else { - println!( - "Failed to find blocklist {} for {}", - blh, - path.to_str().unwrap() - ); - } - } - } - } - _ => (), - } - } -} - -#[derive(Deserialize)] -pub(self) struct IEntry { - pub(self) hash: Option, - pub(self) metablockhash: Option, - pub(self) metahash: String, - pub(self) metasize: i64, - pub(self) path: String, - #[serde(rename = "type")] - pub(self) filetype: String, - pub(self) size: Option, - pub(self) time: Option, - pub(self) blocklists: Option>, -} - -/// Accepts the dlist as a string (must be read in first) -/// Returns a Vec of FileEntrys -pub fn parse_dlist(dlist: &str) -> Vec { - let mut file_entries = Vec::new(); - let entry_list: Vec = serde_json::from_str(dlist).unwrap(); - for entry in entry_list { - file_entries.push(FileEntry::from_ientry(&entry)); - } - - file_entries -} diff --git 
a/src/database.rs b/src/database.rs index 44b9ba4..f672dfd 100644 --- a/src/database.rs +++ b/src/database.rs @@ -1,14 +1,27 @@ -use base64; +use crate::blockhash::BlockIdHash; +use crate::ziparchive::BlockLocation; +use crate::ziparchive::MyCloneFileConfig; +use crate::ziparchive::MyCloneFileReader; +use crate::ziparchive::ZipArchiveWrapper; +use crate::ziparchive::ZipLocation; +use base64::engine::general_purpose; +use base64::Engine; +use eyre::eyre; +use eyre::Context; +use eyre::Result; use indicatif::{ProgressBar, ProgressStyle}; -use rayon::prelude::*; +use rayon::prelude::IntoParallelRefIterator; +use rayon::prelude::ParallelIterator; use serde::Deserialize; -use serde_json; -use std::fs::File; -use std::io::BufReader; +use smallvec::SmallVec; +use std::collections::HashMap; use std::io::Read; use std::path::Path; -use unqlite::{Transaction, UnQLite, KV}; -use zip; +use std::path::PathBuf; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; +use std::sync::Mutex; +use zip::ZipArchive; #[derive(Deserialize)] #[allow(dead_code)] // Will use all these fields in the future @@ -29,85 +42,255 @@ struct Manifest { pub(self) app_version: String, } -pub struct DB { - conn: UnQLite, +pub struct HashToPath { + /// Maps hash (without base64) to location in dblock.zip + /// + /// May be faster, but it's memory-intensive + hash2path: HashMap, BlockLocation>, +} +impl HashToPath { + pub fn new() -> Self { + Self { + hash2path: HashMap::new(), + } + } + + pub fn get_zip_path_by_block_id(&self, block_id: &BlockIdHash) -> Option { + self.hash2path + .get(&block_id.hash) + .map(|v| v.ziplocation.path.clone()) + } + + pub fn get_location_by_block_id(&self, block_id: &BlockIdHash) -> Option { + self.hash2path.get(&block_id.hash).cloned() + } + + pub fn insert_location( + &mut self, + hash: SmallVec<[u8; 32]>, + ziplocation: &Arc, + entry_index: usize, + ) { + self.hash2path.insert( + hash.into(), + BlockLocation { + ziplocation: ziplocation.clone(), + file_index: 
entry_index as u32, + }, + ); + } +} +pub struct HashToBlocks { + /// Maps zip file name to a singleton zip reader + zip2ziparchive: HashMap, + + /// zip_entry_name -> zip_name + /// + /// takes a lot of RAM so it's not used by default + hash2path: Option, +} + +impl HashToBlocks { + pub fn new(use_hash_to_path: bool) -> Self { + Self { + hash2path: if use_hash_to_path { + Some(HashToPath::new()) + } else { + None + }, + zip2ziparchive: HashMap::new(), + } + } + + pub fn get_location_by_block_id(&self, block_id: &BlockIdHash) -> Option { + if let Some(hash2path) = &self.hash2path { + hash2path.get_location_by_block_id(block_id) + } else { + self.get_location_by_block_id_purezip(block_id) + } + } + pub fn get_location_by_block_id_purezip( + &self, + block_id: &BlockIdHash, + ) -> Option { + let buf = &mut [0u8; 48]; + let name_reencoded = block_id.as_base64_urlsafe(buf); + for ziparch in self.zip2ziparchive.values() { + let location = ziparch.get_block_location(name_reencoded); + if location.is_some() { + return location; + } + } + None + } + + pub fn get_zip_archive(&self, zip_filename: &str) -> Option> { + let zip = self.zip2ziparchive.get(zip_filename); + + zip.map(|zip| zip.archive.clone()) + } + + pub fn get_zip_by_block_id( + &self, + block_id: &BlockIdHash, + ) -> Option> { + if let Some(hash2path) = &self.hash2path { + let zname = hash2path.get_zip_path_by_block_id(block_id); + let zname = zname.map(|n| n.to_string_lossy().to_string()); + zname.and_then(|zname| self.get_zip_archive(&zname)) + } else { + self.get_zip_by_block_id_purezip(block_id) + } + } + + pub fn get_zip_by_block_id_purezip( + &self, + block_id: &BlockIdHash, + ) -> Option> { + let buf = &mut [0u8; 48]; + let name_reencoded = block_id.as_base64_urlsafe(buf); + for ziparch in self.zip2ziparchive.values() { + if ziparch.contains_file_name(name_reencoded) { + return Some(ziparch.archive.clone()); + } + } + None + } +} + +pub struct DFileDatabase { + inner: Arc>, manifest: Manifest, } -impl DB 
{ - pub fn new(file: &str, manifest: &str) -> DB { - let conn = UnQLite::create(file); - let manifest: Manifest = serde_json::from_str(manifest).unwrap(); - DB { conn, manifest } +impl DFileDatabase { + pub fn new(manifest_bytes: &[u8], use_hash_to_path: bool) -> Result { + let manifest: Manifest = serde_json::from_slice(manifest_bytes)?; + + let inner = Arc::new(Mutex::new(HashToBlocks::new(use_hash_to_path))); + let db = Self { inner, manifest }; + Ok(db) } - pub fn create_block_id_to_filenames(self, paths: &[String]) -> Self { + pub fn create_block_id_to_filenames(&self, paths: &[PathBuf]) -> Result<()> { // Iterate through dblocks, adding them to the db let pb = ProgressBar::new(paths.len() as u64); pb.set_style( ProgressStyle::default_bar() .template( "[{elapsed_precise}] {wide_bar:40.cyan/blue} {pos:>7}/{len:7} {msg} [{eta_precise}]", - ) + )? .progress_chars("##-"), ); - let conn = &self.conn; - paths - .par_iter() - .map(|path| { - // In this stage, open the file - let file = File::open(&Path::new(path)).unwrap(); - let buf = BufReader::new(file); - let zip = zip::ZipArchive::new(buf).unwrap(); - (zip, path) - }) - .map(|(mut zip, path)| { - // Convert to a list of paths - let paths: Vec = (0..zip.len()) - .map(|i| zip.by_index(i).unwrap().name().to_string()) - .collect(); - (paths, path) - }) - .for_each(|(paths, path)| { - let bytes = path.as_bytes(); - for p in paths { - let hash = base64::decode_config(&p, base64::URL_SAFE).unwrap(); - conn.kv_store(hash, bytes).unwrap(); - } - conn.commit().unwrap(); - pb.inc(1); - }); - - self - } - - pub fn get_filename_from_block_id(&self, block_id: &str) -> Option { - let conn = &self.conn; - // println!("{}", block_id); - // let converted_block_id = base64_url_to_plain(block_id); - let result = conn.kv_fetch(base64::decode_config(block_id, base64::STANDARD).unwrap()); - if let Ok(path_bytes) = result { - Some(String::from_utf8(path_bytes).unwrap()) - } else { - None + paths.par_iter().try_for_each(|zip_path| -> 
Result<()> { + self.import_from_zip(zip_path) + .wrap_err_with(|| format!("import_from_zip: {:?}", zip_path))?; + pb.inc(1); + + Ok(()) + })?; + + Ok(()) + } + + pub fn import_from_zip(&self, zip_path: &PathBuf) -> Result<()> { + // In this stage, open the file + let zip_path = Path::new(&zip_path).to_path_buf(); + let config = Arc::new(MyCloneFileConfig { + path: zip_path.clone(), + buf_capacity: AtomicU32::new(1024), + }); + let zipbuf = MyCloneFileReader::new(config.clone())?; + let ziparch = zip::ZipArchive::new(zipbuf)?; + + let arc_ziploc = Arc::new(ZipLocation { path: zip_path }); + + if self.inner.lock().unwrap().hash2path.is_some() { + self.register_hash_to_path(&ziparch, arc_ziploc.clone())?; + } + + self.register_zip_archive(config, arc_ziploc, ziparch); + + Ok(()) + } + /// Remembers zip file names in a hashmap + /// + /// zip_entry_name -> zip_name + pub fn register_hash_to_path( + &self, + ziparch: &ZipArchive, + ziplocation: Arc, + ) -> Result<()> { + for (index, file_name) in ziparch.file_names_ordered().enumerate() { + // file_name is a hash in base64 + let hash = general_purpose::URL_SAFE.decode(file_name)?; + + if hash.len() > 32 { + Err(eyre!("warn: hash len:{} requires heap alloc", hash.len()))? 
+ } + + let mut inner = self.inner.lock().unwrap(); + if let Some(hash2path) = &mut inner.hash2path { + hash2path.insert_location(hash.into(), &ziplocation, index); + } + } + Ok(()) + } + + pub fn register_zip_archive( + &self, + config: Arc, + ziplocation: Arc, + ziparch: ZipArchive, + ) { + use std::sync::atomic::Ordering; + config.buf_capacity.store(32 * 1024, Ordering::Relaxed); + let path_str = ziplocation.path.to_string_lossy().to_string(); + let wrapper = ZipArchiveWrapper { + ziplocation, + archive: ziparch, + }; + + { + let mut inner = self.inner.lock().unwrap(); + inner.zip2ziparchive.insert(path_str, wrapper); } } - pub fn get_content_block(&self, block_id: &str) -> Option> { - let mut output = Vec::new(); - if let Some(filename) = self.get_filename_from_block_id(block_id) { - let mut zip = zip::ZipArchive::new(File::open(filename).unwrap()).unwrap(); - let mut block = zip - .by_name(&base64::encode_config( - &base64::decode(block_id).unwrap(), - base64::URL_SAFE, - )) - .unwrap(); - block.read_to_end(&mut output).unwrap(); + pub fn get_block_id_location(&self, block_id: &BlockIdHash) -> Option { + self.inner + .lock() + .unwrap() + .get_location_by_block_id(block_id) + } + + pub fn get_zip_by_block_id( + &self, + block_id: &BlockIdHash, + ) -> Option> { + self.inner.lock().unwrap().get_zip_by_block_id(block_id) + } + + pub fn get_content_block( + &self, + block_id: &BlockIdHash, + block_buf: &mut Vec, + ) -> Result> { + let ziparch = self.get_zip_by_block_id(block_id); + + if let Some(mut ziparch) = ziparch { + let base64_buf = &mut [0u8; 48]; + let name_reencoded = block_id.as_base64_urlsafe(base64_buf); + let mut block = ziparch + .by_name(name_reencoded) + .wrap_err("block file by name not found even though we indexed it before")?; + let n = block + .read_to_end(block_buf) + .wrap_err_with(|| format!("reading block file {:?}", block_id))?; - Some(output) + Ok(Some(n)) } else { - None + Ok(None) } } diff --git a/src/dfileentry.rs b/src/dfileentry.rs 
new file mode 100644 index 0000000..2bdbb76 --- /dev/null +++ b/src/dfileentry.rs @@ -0,0 +1,144 @@ +use crate::blockhash::BlockIdHash; +use crate::dfiletype::FileType; +use crate::stripbom::strip_bom_from_bufread; +use crate::FileEntries; +use eyre::eyre; +use eyre::Context; +use eyre::Result; +use serde::Deserialize; +use serde_json::de::IoRead; +use serde_json::Deserializer; +use smallvec::SmallVec; +use std::io::prelude::*; + +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord)] +pub struct FileEntry { + pub path: String, + #[allow(unused)] + pub metahash: String, + #[allow(unused)] + pub metasize: i64, + pub file_type: FileType, + pub block_lists: SmallVec<[BlockIdHash; 1]>, +} + +impl FileEntry { + pub(self) fn from_ientry(ientry: &IEntry) -> Result { + let path = ientry.path.clone(); + let metahash = ientry.metahash.clone(); + let metasize = ientry.metasize; + let mut block_lists = SmallVec::new(); + + if let Some(blocks) = &ientry.blocklists { + for block in blocks { + block_lists.push( + BlockIdHash::from_base64(block) + .ok_or_else(|| eyre!("blocklists BlockIdHash::from_base64 fail"))?, + ); + } + }; + let file_type = match ientry.filetype.as_ref() { + "File" => FileType::File { + hash: ientry + .hash + .as_ref() + .map(|hash| { + BlockIdHash::from_base64(hash) + .ok_or_else(|| eyre!("ientry.hash BlockIdHash::from_base64 fail")) + }) + .ok_or_else(|| eyre!("hash not found"))??, + size: ientry.size.ok_or_else(|| eyre!("size not found"))?, + time: ientry.time.clone().ok_or_else(|| eyre!("time not found"))?, + }, + "Folder" => FileType::Folder { + metablockhash: ientry + .metablockhash + .clone() + .ok_or_else(|| eyre!("metablockhash not found"))?, + }, + _ => FileType::SymLink, + }; + + Ok(FileEntry { + path, + metahash, + metasize, + file_type, + block_lists, + }) + } + + pub fn is_file(&self) -> bool { + self.file_type.is_file() + } + + pub fn is_folder(&self) -> bool { + self.file_type.is_folder() + } + + /// How much bytes it probably takes on disk 
when restoring + pub fn predicted_time(&self) -> u64 { + // Not an accurate number + let psize = 4 * 1024 + self.path.len() as u64; + if let FileType::File { size, .. } = self.file_type { + psize + size as u64 + } else { + psize + } + } + + pub fn bytes_size(&self) -> u64 { + if let FileType::File { size, .. } = self.file_type { + size as u64 + } else { + 0 + } + } +} + +#[derive(Deserialize)] +pub(self) struct IEntry { + #[serde(rename = "type")] + pub(self) filetype: String, + pub(self) path: String, + pub(self) hash: Option, + pub(self) size: Option, + + pub(self) metablockhash: Option, + pub(self) metahash: String, + pub(self) metasize: i64, + + pub(self) time: Option, + pub(self) blocklists: Option>, +} + +#[allow(unused)] +/// Accepts the dlist as a string (must be read in first) +/// Returns a Vec of FileEntrys +pub fn parse_dlist(dlist: &[u8]) -> Result { + let file_entries = parse_dlist_read(dlist)?; + + Ok(file_entries) +} + +/// Accepts the dlist as a Read trait +/// Returns a Vec of FileEntrys +pub fn parse_dlist_read(mut rdr: R) -> Result { + let mut file_entries = Vec::new(); + + strip_bom_from_bufread(&mut rdr)?; + + let iread = IoRead::new(rdr); + let mut de = Deserializer::new(iread); + let entry_list: Vec = + serde_path_to_error::deserialize(&mut de).wrap_err("deserialize entry_list")?; + + for entry in entry_list { + let entry = FileEntry::from_ientry(&entry).wrap_err("FileEntry::from_ientry")?; + file_entries.push(entry); + } + + Ok(FileEntries { + entries: file_entries, + }) +} diff --git a/src/dfiletype.rs b/src/dfiletype.rs new file mode 100644 index 0000000..10a0193 --- /dev/null +++ b/src/dfiletype.rs @@ -0,0 +1,32 @@ +use crate::blockhash::BlockIdHash; + +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord)] +pub enum FileType { + File { + hash: BlockIdHash, + size: i64, + time: String, + }, + Folder { + metablockhash: String, + }, + SymLink, +} + +impl FileType { + pub fn is_file(&self) -> bool { + matches!(self, FileType::File { .. 
}) + } + + #[allow(unused)] + pub fn is_nonzero_file(&self) -> bool { + match self { + FileType::File { size, .. } => *size > 0, + _ => false, + } + } + + pub fn is_folder(&self) -> bool { + matches!(self, FileType::Folder { .. }) + } +} diff --git a/src/dhatprof.rs b/src/dhatprof.rs new file mode 100644 index 0000000..281c9d3 --- /dev/null +++ b/src/dhatprof.rs @@ -0,0 +1,15 @@ +#[cfg(feature = "dhat-heap")] +pub fn start_dhat_profiler() { + std::thread::spawn(|| { + let _profiler = dhat::Profiler::new_heap(); + + std::thread::sleep(std::time::Duration::from_secs(10 * 60)); + // save profile after 10 minutes + }); + + std::thread::sleep(std::time::Duration::from_millis(200)); +} + +/// Does nothing +#[cfg(not(feature = "dhat-heap"))] +pub fn start_dhat_profiler() {} diff --git a/src/flags.rs b/src/flags.rs new file mode 100644 index 0000000..a53a672 --- /dev/null +++ b/src/flags.rs @@ -0,0 +1,33 @@ +use clap::Parser; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +pub struct RestoreFlags { + /// the location of the backup + #[arg(short, long)] + pub backup_dir: String, + + /// a location to restore to + #[arg(short, long, value_name = "FILE")] + pub restore_dir: Option, + + /// 1 thread will save and read files sequentially + #[arg(short, long, default_value_t = 4)] + pub threads_rayon: usize, + + /// displays progress bar in CLI + #[arg(short, long)] + pub progress_bar: bool, + + /// true if use additional hashmap to speed up hashed name lookup. Increases memory usage. 
+ #[arg(long)] + pub hash_to_path: bool, + + /// true to restore windows backup on linux + #[arg(long)] + pub replace_backslash_to_slash: Option, + + /// true to verify without writing files to disk + #[arg(long)] + pub verify_only: bool, +} diff --git a/src/hexdisplay.rs b/src/hexdisplay.rs new file mode 100644 index 0000000..2655c71 --- /dev/null +++ b/src/hexdisplay.rs @@ -0,0 +1,59 @@ +pub struct HexDisplayBytes<'a>(pub &'a [u8]); +impl<'a> std::fmt::Display for HexDisplayBytes<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for byte in self.0.as_ref().iter() { + let (high, low) = byte2hex(*byte, HEX_CHARS_LOWER); + + write!(f, "{}{}", high as char, low as char)?; + } + + Ok(()) + } +} + +pub struct EscapeWholeString<'a>(pub &'a [u8]); +impl<'a> std::fmt::Display for EscapeWholeString<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for byte in self.0.as_ref().iter() { + let (high, low) = byte2hex(*byte, HEX_CHARS_LOWER); + + write!(f, "\\x{}{}", high as char, low as char)?; + } + + Ok(()) + } +} + +pub struct EscapeRawString<'a>(pub &'a [u8]); +impl<'a> std::fmt::Display for EscapeRawString<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "\"")?; + for &b in self.0.as_ref().iter() { + escape_byte_maybe(f, b)?; + } + write!(f, "\"")?; + + Ok(()) + } +} + +fn escape_byte_maybe(f: &mut std::fmt::Formatter<'_>, b: u8) -> std::fmt::Result { + if b > 32 && b < 126 && b != b'"' { + write!(f, "{}", b as char)?; + } else { + let (high, low) = byte2hex(b, HEX_CHARS_LOWER); + + write!(f, "\\x{}{}", high as char, low as char)?; + } + Ok(()) +} + +const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef"; + +/// returns 2 chars representing byte in hex +fn byte2hex(byte: u8, table: &[u8; 16]) -> (u8, u8) { + let high = table[((byte & 0xf0) >> 4) as usize]; + let low = table[(byte & 0x0f) as usize]; + + (high, low) +} diff --git a/src/main.rs b/src/main.rs index 
ac3725a..44315ab 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,127 +1,286 @@ -mod blockid; +#![warn(rust_2018_idioms)] + +mod blockhash; mod database; +mod dfileentry; +mod dfiletype; +mod flags; +mod hexdisplay; +mod restoring; +mod sorting; +mod stripbom; +mod ziparchive; + +use crate::flags::RestoreFlags; +use crate::restoring::{restore_entry, RestoreContext, RestoreParams, RestoreSummary}; +use crate::sorting::sort_files_sequentially; +use crate::stripbom::StripBom; -use blockid::*; +use clap::Parser; use database::*; -use num_cpus; +use dfileentry::*; +use dhatprof::start_dhat_profiler; +use eyre::eyre; +use eyre::{Context, Result}; use pbr::ProgressBar; use rayon::prelude::*; use std::fs; use std::fs::File; -use std::io::{stdin, Read}; -use std::path::Path; +use std::io::{BufReader, Read}; +use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; -use zip; +mod dhatprof; fn main() { - println!("Enter the location of the backup:"); - let mut backup_dir = String::new(); - stdin() - .read_line(&mut backup_dir) - .expect("Did not enter a location."); - println!(); - let backup_dir = backup_dir.trim(); + start_dhat_profiler(); - println!("Enter a location to restore to:"); - let mut restore_dir = String::new(); - stdin() - .read_line(&mut restore_dir) - .expect("Did not enter a location."); - println!(); - let restore_dir = restore_dir.trim(); + let result = run(); + match result { + Err(err) => { + println!("err: {:?}", err); + } + Ok(_) => { + println!("Finished without errors!"); + } + } +} - let db_location = Path::join(Path::new(backup_dir), Path::new("index.db")); - let db_location = db_location.to_str().unwrap(); +fn filename_ends_with>(path: P, suffix: &str) -> bool { + path.as_ref() + .file_name() + .and_then(|name| name.to_str()) + .map(|name| name.ends_with(suffix)) + .unwrap_or(false) +} - println!( - "Enter number of threads to use (Default {}):", - num_cpus::get() - ); - let mut cpu_input = String::new(); - stdin() - .read_line(&mut cpu_input) 
- .expect("Did not enter a number"); - let cpu_count: usize = match cpu_input.trim().parse() { - Ok(i) => i, - Err(..) => num_cpus::get(), +fn path_is_dlist_zip>(path: P) -> bool { + filename_ends_with(path, "dlist.zip") +} +fn path_is_dblock_zip>(path: P) -> bool { + filename_ends_with(path, "dblock.zip") +} + +pub struct FileEntries { + pub entries: Vec, +} + +/// Open dlist file and parse json inside +fn parse_dlist_file>(dlist_path: P) -> Result { + let dlist_reader = File::open(dlist_path.as_ref()) + .wrap_err_with(|| format!("open {:?}", dlist_path.as_ref()))?; + let mut dlist_zip = zip::ZipArchive::new(dlist_reader)?; + let filelist_name = "filelist.json"; + let dlist_file = dlist_zip.by_name(filelist_name)?; + let bufrdr = BufReader::with_capacity(32 * 1024, dlist_file); + let list = parse_dlist_read(bufrdr).wrap_err_with(|| { + format!( + "parse_dlist {:?} / {:?}", + dlist_path.as_ref(), + filelist_name + ) + })?; + + Ok(list) +} + +/// Open Manifest from zip +fn read_manifest>(dlist_path: P) -> Result> { + let manifest_file = File::open(dlist_path.as_ref())?; + let mut manifest_zip = zip::ZipArchive::new(manifest_file)?; + let mut manifest_file = manifest_zip.by_name("manifest")?; + let mut manifest_contents = String::new(); + manifest_file + .read_to_string(&mut manifest_contents) + .wrap_err_with(|| format!("read manifest from {:?}", dlist_path.as_ref()))?; + let manifest_contents = manifest_contents.strip_bom(); + let manifest_contents = manifest_contents.trim(); + Ok(manifest_contents.into()) +} + +fn run() -> Result<()> { + let args = RestoreFlags::parse(); + let backup_dir = args.backup_dir.trim().to_string(); + let restore_dir = if !args.verify_only { + let dir = args + .restore_dir + .as_ref() + .ok_or_else(|| eyre!("--restore_dir not provided"))?; + Some(dir.trim()) + } else { + None }; - println!(); // Set CPU count rayon::ThreadPoolBuilder::new() - .num_threads(cpu_count) + .num_threads(args.threads_rayon) .build_global() .unwrap(); // Find 
newest dlist - let mut dlist_file_names: Vec = fs::read_dir(&backup_dir) - .unwrap() + let mut dlist_file_paths: Vec = fs::read_dir(&backup_dir)? .filter_map(Result::ok) - .filter(|f| f.path().to_str().unwrap().ends_with("dlist.zip")) - .map(|f| f.path().to_str().unwrap().to_string()) + .filter(|f| path_is_dlist_zip(f.path())) + .map(|f| f.path()) .collect(); - dlist_file_names.sort(); + dlist_file_paths.sort(); + + let newest_dlist = dlist_file_paths + .last() + .ok_or_else(|| eyre!("last modified dlist file not found"))?; + + println!( + "Newest: {:?} appears to be newest dlist, using it.", + newest_dlist + ); + println!("Parsing manifest"); + let manifest_contents = read_manifest(newest_dlist)?; + + // Open dblock db connection and build db + println!(); + let db_join = std::thread::spawn(move || -> Result { + println!("Listing dblocks"); + // Get list of dblocks + let zip_file_names: Vec = fs::read_dir(backup_dir) + .wrap_err("read_dir(backup_dir)")? + .filter_map(Result::ok) + .filter(|f| path_is_dblock_zip(f.path())) + .map(|f| f.path()) + .collect(); - let dlist = dlist_file_names[dlist_file_names.len() - 1].clone(); + println!("Found {} dblocks", zip_file_names.len()); + println!("Indexing dblocks"); + let dblock_db = DFileDatabase::new(&manifest_contents, args.hash_to_path)?; + dblock_db.create_block_id_to_filenames(&zip_file_names)?; + Ok(dblock_db) + }); - println!("{} appears to be newest dlist, using it.", dlist); println!("Parsing dlist"); + let file_entries = parse_dlist_file(newest_dlist)?; + let summary = calculate_summary(&file_entries.entries); - // Open dlist file - let mut dlist_zip = zip::ZipArchive::new(File::open(dlist.clone()).unwrap()).unwrap(); - let mut dlist_file = dlist_zip.by_name("filelist.json").unwrap(); - let mut dlist_contents = String::new(); - dlist_file.read_to_string(&mut dlist_contents).unwrap(); - let file_entries = parse_dlist(&dlist_contents); + let dblock_db = db_join.join().unwrap()?; - // Open Manifest - let mut 
manifest_zip = zip::ZipArchive::new(File::open(dlist.clone()).unwrap()).unwrap(); - let mut manifest_file = manifest_zip.by_name("manifest").unwrap(); - let mut manifest_contents = String::new(); - manifest_file - .read_to_string(&mut manifest_contents) - .unwrap(); - let manifest_contents = manifest_contents.replace("\u{feff}", ""); - let manifest_contents = manifest_contents.trim(); + print_summary(&summary); - let file_count = file_entries.iter().filter(|f| f.is_file()).count(); - println!("{} files to be restored", file_count); - let folder_count = file_entries.iter().filter(|f| f.is_folder()).count(); - println!("{} folders to be restored", folder_count); - println!(); + let restore_params = RestoreParams { + db: Arc::new(dblock_db), + restore_path: restore_dir, + replace_backslash_to_slash: args.replace_backslash_to_slash.unwrap_or(!cfg!(windows)), + summary, + }; + restore_all(&args, &restore_params, file_entries)?; - // Get list of dblocks - let zip_file_names: Vec = fs::read_dir(backup_dir) - .unwrap() - .filter_map(Result::ok) - .map(|f| f.path().to_str().unwrap().to_string()) - .filter(|f| f.ends_with("dblock.zip")) + Ok(()) +} + +fn restore_all( + args: &RestoreFlags, + params: &RestoreParams<'_>, + file_entries: FileEntries, +) -> Result<()> { + let folders: Vec = file_entries + .entries + .iter() + .filter(|f| f.is_folder()) + .cloned() .collect(); + println!("Sorting file_entries"); + let doing = if params.restore_path.is_some() { + "Restoring" + } else { + "Verifying" + }; + let pb = if args.progress_bar { + Some(Arc::new(Mutex::new(ProgressBar::new( + params.summary.folder_count as u64, + )))) + } else { + None + }; - println!("Found {} dblocks", zip_file_names.len()); + let dbc = params.db.clone(); + let sort_join = std::thread::spawn(move || -> FileEntries { + let mut file_entries = file_entries; + sort_files_sequentially(&mut file_entries.entries, &dbc); + file_entries + }); - // Open dblock db connection and build db - println!(); - 
println!("Indexing dblocks"); - let dblock_db = - DB::new(db_location, &manifest_contents).create_block_id_to_filenames(&zip_file_names); - - println!("Restoring directory structure"); - let mut pb = ProgressBar::new(folder_count as u64); - for d in file_entries.iter().filter(|f| f.is_folder()) { - d.restore_file(&dblock_db, &restore_dir); - pb.inc(); + println!("{doing} directory structure"); + + folders.iter().par_bridge().try_for_each_with( + RestoreContext::new(), + |ctx, entry_folder| -> Result<()> { + restore_entry(entry_folder, params, ctx) + .wrap_err_with(|| format!("restoring dir {:?}", entry_folder.path))?; + if let Some(pb) = &pb { + pb.lock().unwrap().inc(); + } + Ok(()) + }, + )?; + if let Some(pb) = &pb { + pb.lock().unwrap().tick(); + } + + if !sort_join.is_finished() { + println!("Waiting for sorting to finish"); } + let file_entries = sort_join.join().unwrap(); + println!(); - println!("Restoring files"); - let pb = Arc::new(Mutex::new(ProgressBar::new(file_count as u64))); + println!("{doing} files"); + let pb = if args.progress_bar { + Some(Arc::new(Mutex::new(ProgressBar::new( + params.summary.predicted_bytes, + )))) + } else { + None + }; file_entries - .par_iter() + .entries + .iter() .filter(|f| f.is_file()) - .for_each(|f| { - f.restore_file(&dblock_db, &restore_dir); - pb.lock().unwrap().inc(); - }); + .par_bridge() + .try_for_each_with(RestoreContext::new(), |ctx, entry_file| -> Result<()> { + restore_entry(entry_file, params, ctx) + .wrap_err_with(|| format!("restoring file {:?}", entry_file.path))?; + if let Some(pb) = &pb { + pb.lock().unwrap().add(entry_file.predicted_time()); + } + Ok(()) + })?; + if let Some(pb) = &pb { + pb.lock().unwrap().tick(); + } + println!(); + + Ok(()) } +fn calculate_summary(entries: &[FileEntry]) -> RestoreSummary { + let file_count = entries.iter().filter(|f| f.is_file()).count(); + let folder_count = entries.iter().filter(|f| f.is_folder()).count(); + let predicted_bytes: u64 = entries.iter().map(|f| 
f.predicted_time()).sum(); + let total_bytes: u64 = entries.iter().map(|f| f.bytes_size()).sum(); + RestoreSummary { + file_count, + folder_count, + total_bytes, + predicted_bytes, + } +} + +fn print_summary(summary: &RestoreSummary) { + println!("{} files to be restored", summary.file_count); + println!("{} folders to be restored", summary.folder_count); + println!("{} bytes in files", summary.total_bytes); + println!( + "{} bytes on drive to be restored (predicted)", + summary.predicted_bytes + ); +} + +#[cfg(feature = "dhat-heap")] +#[global_allocator] +static ALLOC: dhat::Alloc = dhat::Alloc; diff --git a/src/restoring.rs b/src/restoring.rs new file mode 100644 index 0000000..c78dcac --- /dev/null +++ b/src/restoring.rs @@ -0,0 +1,345 @@ +use crate::{ + blockhash::BlockIdHash, database::DFileDatabase, dfileentry::FileEntry, dfiletype::FileType, + hexdisplay::HexDisplayBytes, +}; +use eyre::eyre; +use eyre::{Context, Result}; +use sha2::{Digest, Sha256}; +use std::sync::Arc; +use std::{ + cell::RefCell, + fs::{self, File}, + io::{Seek, SeekFrom, Write}, + path::{Path, PathBuf}, +}; + +#[derive(Clone)] +pub struct RestoreContext { + pub block_buffer: RefCell>, + pub block_hashes_buffer: RefCell>, +} + +impl RestoreContext { + pub fn new() -> Self { + Self { + block_buffer: RefCell::new(Vec::with_capacity(8 * 1024)), + block_hashes_buffer: RefCell::new(Vec::with_capacity(8 * 1024)), + } + } +} + +struct RestoreFileContext<'a> { + restore_context: &'a RestoreContext, + db: &'a DFileDatabase, + + entry: &'a FileEntry, + hash: &'a BlockIdHash, + size: i64, + + debug_location: bool, + strict_block_size: bool, + hasher: RefCell>, + + /// None if only verifying + absolute_path: Option<&'a PathBuf>, + /// None if only verifying + relative_file_path: Option<&'a PathBuf>, + + out_file: RefCell>, +} + +pub struct RestoreSummary { + pub file_count: usize, + pub folder_count: usize, + pub total_bytes: u64, + pub predicted_bytes: u64, +} + +pub struct RestoreParams<'a> { + pub 
db: Arc, + pub restore_path: Option<&'a str>, + pub replace_backslash_to_slash: bool, + pub summary: RestoreSummary, +} +/// Returns Some(absolute, relative) +pub fn calculate_path(entry: &FileEntry, params: &RestoreParams<'_>) -> Option<(PathBuf, PathBuf)> { + if let Some(restore_path) = ¶ms.restore_path { + let root_path = Path::new(restore_path); + let dfile_path = &entry.path[0..]; + let mut dfile_path = dfile_path.replacen(":\\", "\\", 1); + if params.replace_backslash_to_slash { + dfile_path = dfile_path.replace('\\', "/"); + } + let relative_file_path = PathBuf::from(&dfile_path); + + let path = Path::join(root_path, &relative_file_path); + Some((path, relative_file_path)) + } else { + None + } +} + +pub fn restore_entry( + entry: &FileEntry, + params: &RestoreParams<'_>, + restore_context: &RestoreContext, +) -> Result<()> { + let paths = calculate_path(entry, params); + let absolute_path = paths.as_ref().map(|v| &v.0); + let relative_file_path = paths.as_ref().map(|v| &v.1); + + match &entry.file_type { + FileType::Folder { .. } => { + if let Some(path) = absolute_path { + fs::create_dir_all(path)?; + } + } + FileType::File { hash, size, .. } => { + restore_file( + params, + restore_context, + hash, + *size, + absolute_path, + relative_file_path, + entry, + )?; + } + _ => (), + } + Ok(()) +} +fn restore_file( + params: &RestoreParams<'_>, + restore_context: &RestoreContext, + hash: &BlockIdHash, + size: i64, + absolute_path: Option<&PathBuf>, + relative_file_path: Option<&PathBuf>, + entry: &FileEntry, +) -> Result<()> { + let hasher = if size > 0 { Some(Sha256::new()) } else { None }; + let out_file = if let Some(path) = &absolute_path { + Some(File::create(path)?) 
+ } else { + None + }; + let context = RestoreFileContext { + restore_context, + entry, + db: ¶ms.db, + debug_location: false, + strict_block_size: true, + hash, + size, + hasher: RefCell::new(hasher), + absolute_path, + relative_file_path, + out_file: RefCell::new(out_file), + }; + + // Small files only have one block + if entry.block_lists.is_empty() { + restore_file_singleblock(&context)?; + } else { + restore_file_multiblock(&context)?; + } + + check_file_hash(&context)?; + Ok(()) +} + +fn restore_file_singleblock(ctx: &RestoreFileContext<'_>) -> Result<()> { + debug_block_restore_maybe(ctx, true); + + if ctx.size <= 0 { + return Ok(()); + } + + let buf = &mut ctx.restore_context.block_buffer.borrow_mut(); + buf.clear(); + let block = ctx.db.get_content_block(ctx.hash, buf)?; + let _len = block.ok_or(|| eyre!("Missing block {} for {:?}", ctx.hash, ctx.absolute_path)); + + if let Some(out_file) = ctx.out_file.borrow_mut().as_mut() { + out_file + .write_all(buf.as_slice()) + .wrap_err("write single-block file")?; + } + update_hasher_maybe(ctx, buf); + + Ok(()) +} +fn debug_block_restore_maybe(ctx: &RestoreFileContext<'_>, is_multi: bool) { + if !ctx.debug_location { + return; + } + + let multi_or_single = if is_multi { "multi" } else { "single" }; + let hash = if is_multi { + Some(ctx.hash) + } else { + ctx.entry.block_lists.first() + }; + let loc = hash.and_then(|hash| ctx.db.get_block_id_location(hash)); + println!( + "restoring file ({}) {:?}, index:{:?}", + multi_or_single, + ctx.relative_file_path, + loc.map(|loc| loc.file_index) + ); +} + +fn restore_file_multiblock(ctx: &RestoreFileContext<'_>) -> Result<()> { + debug_block_restore_maybe(ctx, true); + + // Each blockid points to a list of blockids + for (main_hash_index, main_hash) in ctx.entry.block_lists.iter().enumerate() { + restore_file_multiblock_main(ctx, main_hash_index, main_hash)?; + } + + Ok(()) +} + +fn update_hasher_maybe(ctx: &RestoreFileContext<'_>, buf: &[u8]) { + let mut hasher = 
ctx.hasher.borrow_mut(); + if let Some(h) = hasher.as_mut() { + h.update(buf); + } +} + +fn restore_file_multiblock_block( + ctx: &RestoreFileContext<'_>, + block_index: usize, + block_hash: &[u8], + blockhashoffset: usize, + last_block_size: &mut Option, +) -> Result<()> { + //let bhash = base64::encode(bhash); + let block_hash = BlockIdHash::from_bytes(block_hash) + .ok_or_else(|| eyre!("binary hash len is not 32 bytes"))?; + let buf = &mut ctx.restore_context.block_buffer.borrow_mut(); + buf.clear(); + let block = ctx + .db + .get_content_block(&block_hash, buf) + .wrap_err_with(|| { + format!( + "get one of content blocks (number {}): {}", + block_index, block_hash + ) + })?; + + let _block_len = block.ok_or_else(|| { + eyre!( + "Failed to find block {} for {:?}", + block_hash, + ctx.absolute_path + ) + })?; + + if let Some(out_file) = ctx.out_file.borrow_mut().as_mut() { + let full_block = ctx.db.block_size(); + let offset = (blockhashoffset + block_index * full_block) as u64; + out_file + .seek(SeekFrom::Start(offset)) + .wrap_err("seek blockhashoffset + bi * full_block")?; + out_file + .write_all(buf.as_slice()) + .wrap_err("write (multi) block")?; + } + update_hasher_maybe(ctx, buf); + check_strict_block(ctx, buf, last_block_size)?; + + Ok(()) +} + +fn check_strict_block( + ctx: &RestoreFileContext<'_>, + buf: &[u8], + last_block_size: &mut Option, +) -> Result<()> { + if !ctx.strict_block_size { + return Ok(()); + } + if let Some(last) = last_block_size { + let full_block = ctx.db.block_size(); + if *last != full_block { + Err(eyre!( + "last block size != full_block, {} != {}", + last, + full_block + ))?; + } + } + *last_block_size = Some(buf.len()); + + Ok(()) +} + +fn restore_file_multiblock_main( + ctx: &RestoreFileContext<'_>, + main_hash_index: usize, + main_hash: &BlockIdHash, +) -> Result<()> { + let blockhashoffset = main_hash_index * ctx.db.offset_size(); + + let hashes_buf: &mut Vec = &mut ctx.restore_context.block_hashes_buffer.borrow_mut(); + 
let binary_hashes_len = { + hashes_buf.clear(); + ctx.db + .get_content_block(main_hash, hashes_buf) + .wrap_err_with(|| format!("get main content block: {}", main_hash))? + }; + + let _len = binary_hashes_len.ok_or_else(|| { + eyre!( + "Failed to find blocklist {} for {:?}", + main_hash, + ctx.absolute_path, + ) + })?; + + let mut last_block_size = None; + for (bi, bhash) in hashes_buf.chunks(ctx.db.hash_size()).enumerate() { + restore_file_multiblock_block(ctx, bi, bhash, blockhashoffset, &mut last_block_size)?; + } + + Ok(()) +} + +fn check_file_hash(ctx: &RestoreFileContext<'_>) -> Result<()> { + if ctx.size == 0 { + return Ok(()); + } + let hasher = { + let mut hasher = None; + std::mem::swap(&mut hasher, &mut ctx.hasher.borrow_mut()); + + hasher + }; + if hasher.is_none() { + return Ok(()); + } + let hasher = hasher.unwrap(); + + let calculated_hash: &[u8] = &hasher.finalize()[..]; + let expected_hash = ctx.hash.hash.as_slice(); + if expected_hash != calculated_hash { + return Err(eyre!( + "hash is invalid: expected != calculated, {} != {}", + HexDisplayBytes(expected_hash), + HexDisplayBytes(calculated_hash) + )); + } + let debug_hash = false; + if debug_hash { + println!( + "hash is valid {} == {}", + HexDisplayBytes(expected_hash), + HexDisplayBytes(calculated_hash) + ); + } + + Ok(()) +} diff --git a/src/sorting.rs b/src/sorting.rs new file mode 100644 index 0000000..091c048 --- /dev/null +++ b/src/sorting.rs @@ -0,0 +1,34 @@ +use crate::{ + database::DFileDatabase, dfileentry::FileEntry, dfiletype::FileType, ziparchive::BlockLocation, +}; +use std::cmp::Ordering; + +/// Not necessary, but useful to speed up file reads from HDD +/// from like 200 Mbit/s to 700 Mbit/s +pub fn sort_files_sequentially(file_entries: &mut [FileEntry], dblock_db: &DFileDatabase) { + file_entries.sort_by(|a, b| compare_fileentry(a, b, dblock_db)); +} + +/// Optional. Used for sorting. 
+pub fn get_first_bytes_location(entry: &FileEntry, db: &DFileDatabase) -> Option { + match &entry.file_type { + FileType::File { hash, .. } => { + if entry.block_lists.is_empty() { + db.get_block_id_location(hash) + } else { + let first = entry.block_lists.first(); + + first.and_then(|bid| db.get_block_id_location(bid)) + } + } + _ => None, + } +} + +/// Optional. Used for sorting. +pub fn compare_fileentry(entry_a: &FileEntry, entry_b: &FileEntry, db: &DFileDatabase) -> Ordering { + let a = get_first_bytes_location(entry_a, db); + let b = get_first_bytes_location(entry_b, db); + + a.cmp(&b).then_with(|| entry_a.cmp(entry_b)) +} diff --git a/src/stripbom.rs b/src/stripbom.rs new file mode 100644 index 0000000..51fa4dc --- /dev/null +++ b/src/stripbom.rs @@ -0,0 +1,74 @@ +use eyre::Result; +use std::io::{BufRead, Read}; + +pub trait StripBom { + fn strip_bom(&self) -> &str; +} + +impl StripBom for str { + fn strip_bom(&self) -> &str { + if let Some(stripped) = self.strip_prefix('\u{feff}') { + stripped + } else { + self + } + } +} + +impl StripBom for String { + fn strip_bom(&self) -> &str { + self[..].strip_bom() + } +} + +fn starts_with_bom_bytes(s: &[u8]) -> bool { + s.starts_with(&[0xEF, 0xBB, 0xBF]) +} + +pub trait StripBomBytes { + fn strip_bom(&self) -> &[u8]; +} + +impl StripBomBytes for [u8] { + fn strip_bom(&self) -> &[u8] { + if let Some(stripped) = self.strip_prefix(&[0xEF, 0xBB, 0xBF]) { + stripped + } else { + self + } + } +} + +pub fn strip_bom_from_bufread(mut inner: R) -> Result<()> { + let buf = inner.fill_buf()?; + if buf.len() >= 3 && starts_with_bom_bytes(buf) { + //println!("removing bom"); + inner.consume(3); + } + + // here we ignore case of 1 or 2 bytes of BOM + Ok(()) +} + +pub struct StripBomReader { + pub first_bytes: bool, + pub inner: R, +} + +impl StripBomReader { + #[allow(unused)] + pub fn new(mut inner: R) -> Result> { + strip_bom_from_bufread(&mut inner)?; + + Ok(Self { + first_bytes: true, + inner, + }) + } +} + +impl Read for 
StripBomReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.inner.read(buf) + } +} diff --git a/src/ziparchive.rs b/src/ziparchive.rs new file mode 100644 index 0000000..0fe326c --- /dev/null +++ b/src/ziparchive.rs @@ -0,0 +1,141 @@ +use eyre::Result; +use std::{ + fs::File, + io::{BufRead, BufReader, IoSliceMut, Read, Seek, SeekFrom}, + path::PathBuf, + sync::{atomic::AtomicU32, Arc}, +}; +use zip::ZipArchive; + +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +/// Path to dblock.zip +pub struct ZipLocation { + // pub path_str: String, + pub path: PathBuf, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct BlockLocation { + /// Which dblock.zip file + pub ziplocation: Arc, + + /// Which file inside the zip + pub file_index: u32, +} + +impl Ord for BlockLocation { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // First zip_path (which dblock.zip it is) + // then file_index inside the ZIP file + self.ziplocation + .cmp(&other.ziplocation) + .then_with(|| self.file_index.cmp(&other.file_index)) + } +} + +impl PartialOrd for BlockLocation { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +pub struct ZipArchiveWrapper { + pub ziplocation: Arc, + pub archive: ZipArchive, +} + +impl ZipArchiveWrapper { + pub fn get_block_location(&self, block_base64: &str) -> Option { + self.archive + .get_file_index(block_base64) + .map(|index| BlockLocation { + file_index: index as u32, + ziplocation: self.ziplocation.clone(), + }) + } + + pub fn contains_file_name(&self, block_base64: &str) -> bool { + self.archive.contains_file_name(block_base64) + } +} + +pub struct MyCloneFileConfig { + pub path: PathBuf, + /// Changes after the files are indexed. + /// Bigger buf helps with large file reads. + /// Smaller buf does less redundant byte reads from disk when indexing. 
+ pub buf_capacity: AtomicU32, +} + +/// Used to share ZipArchive across many threads +/// +/// Multiple ZipArchive structs would allocate too much Vec in rayon threads +/// +/// Therefore we open file again on every .clone() +pub struct MyCloneFileReader { + pub config: Arc, + buf_reader: BufReader, +} + +impl Clone for MyCloneFileReader { + fn clone(&self) -> Self { + Self::new(self.config.clone()).unwrap() + } +} + +impl MyCloneFileReader { + pub fn new(config: Arc) -> Result { + let target_file = File::open(&config.path)?; + let cap = config + .buf_capacity + .load(std::sync::atomic::Ordering::Relaxed); + let filebuf = BufReader::with_capacity(cap as usize, target_file); + + Ok(Self { + config, + buf_reader: filebuf, + }) + } +} + +impl Read for MyCloneFileReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.buf_reader.read(buf) + } + + fn read_exact(&mut self, buf: &mut [u8]) -> std::io::Result<()> { + self.buf_reader.read_exact(buf) + } + + fn read_vectored(&mut self, bufs: &mut [IoSliceMut<'_>]) -> std::io::Result { + self.buf_reader.read_vectored(bufs) + } + + fn read_to_end(&mut self, buf: &mut Vec) -> std::io::Result { + self.buf_reader.read_to_end(buf) + } + + fn read_to_string(&mut self, buf: &mut String) -> std::io::Result { + self.buf_reader.read_to_string(buf) + } +} + +impl Seek for MyCloneFileReader { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.buf_reader.seek(pos) + } + + fn stream_position(&mut self) -> std::io::Result { + self.buf_reader.stream_position() + } +} + +impl BufRead for MyCloneFileReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + self.buf_reader.fill_buf() + } + + fn consume(&mut self, amt: usize) { + self.buf_reader.consume(amt) + } +}