diff --git a/.github/workflows/dependency_modification_check.yml b/.github/workflows/dependency_modification_check.yml deleted file mode 100644 index ac6537af102..00000000000 --- a/.github/workflows/dependency_modification_check.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Check no dependencies were modified - -on: pull_request - -jobs: - dependency_changed_check: - runs-on: ubuntu-latest - steps: - - name: "Checkout repository" - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: "Check Cargo.lock not in changeset" - run: | - git fetch origin - git diff origin/$GITHUB_BASE_REF.. --name-only| ( ! grep "Cargo.lock") diff --git a/.gitignore b/.gitignore index 155e4cbd8a8..f56db437d09 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ test_results/* /resources/linux /resources/x86_64 /resources/aarch64 +.env \ No newline at end of file diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 00000000000..ff8f8f8c879 --- /dev/null +++ b/.tool-versions @@ -0,0 +1,2 @@ +gcloud 534.0.0 +rust 1.79.0 diff --git a/Cargo.lock b/Cargo.lock index 9ad999e44d0..cd65527cfa1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -622,7 +622,7 @@ dependencies = [ "serde_json", "thiserror", "timerfd", - "userfaultfd", + "userfaultfd 0.8.1", "utils", "vmm", "vmm-sys-util", @@ -1456,7 +1456,20 @@ dependencies = [ "libc", "nix 0.27.1", "thiserror", - "userfaultfd-sys", + "userfaultfd-sys 0.5.0", +] + +[[package]] +name = "userfaultfd" +version = "0.9.0" +source = "git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "libc", + "nix 0.27.1", + "thiserror", + "userfaultfd-sys 0.6.0", ] [[package]] @@ -1470,6 +1483,16 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "userfaultfd-sys" +version = "0.6.0" +source = 
"git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bindgen 0.69.4", + "cc", + "cfg-if", +] + [[package]] name = "utf8parse" version = "0.2.2" @@ -1602,7 +1625,7 @@ dependencies = [ "slab", "thiserror", "timerfd", - "userfaultfd", + "userfaultfd 0.9.0", "utils", "vhost", "vm-allocator", diff --git a/Makefile b/Makefile new file mode 100644 index 00000000000..e96fce61f33 --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +-include .env + +.PHONY: build +build: + ./scripts/build.sh + +.PHONY: upload +upload: + ./scripts/upload.sh $(GCP_PROJECT_ID) + +.PHONY: build-and-upload +build-and-upload: build upload diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 48d94a0f050..b04d2886a35 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -220,6 +220,10 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." }, + { + "syscall": "mincore", + "comment": "Used by get_memory_dirty_bitmap to check if memory pages are resident" + }, { "syscall": "mmap", "comment": "Used by the VirtIO balloon device", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 861b69c6b44..455572fa7b8 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -216,6 +216,14 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. 
They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." }, + { + "syscall": "mincore", + "comment": "Used by get_memory_dirty_bitmap to check if memory pages are resident" + }, + { + "syscall": "pread64", + "comment": "Used by get_dirty_memory to read pagemap entries" + }, { "syscall": "mmap", "comment": "Used by the VirtIO balloon device", @@ -524,8 +532,8 @@ "comment": "sigaltstack is used by Rust stdlib to remove alternative signal stack during thread teardown." }, { - "syscall": "getrandom", - "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" + "syscall": "getrandom", + "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" }, { "syscall": "accept4", @@ -1276,4 +1284,4 @@ } ] } -} +} \ No newline at end of file diff --git a/scripts/build.sh b/scripts/build.sh new file mode 100755 index 00000000000..6b831c9b99f --- /dev/null +++ b/scripts/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -euo pipefail + +# The format will be: v.._ — e.g. 
v1.7.2_8bb88311 +# Extract full version from src/firecracker/swagger/firecracker.yaml +FC_VERSION=$(awk '/^info:/{flag=1} flag && /^ version:/{print $2; exit}' src/firecracker/swagger/firecracker.yaml) +commit_hash=$(git rev-parse --short=7 HEAD) +version_name="v${FC_VERSION}_${commit_hash}" +echo "Version name: $version_name" + +echo "Starting to build Firecracker version: $version_name" +tools/devtool -y build --release + +mkdir -p "./build/fc/${version_name}" +cp ./build/cargo_target/x86_64-unknown-linux-musl/release/firecracker "./build/fc/${version_name}/firecracker" +echo "Finished building Firecracker version: $version_name and copied to ./build/fc/${version_name}/firecracker" diff --git a/scripts/upload.sh b/scripts/upload.sh new file mode 100755 index 00000000000..4227c642593 --- /dev/null +++ b/scripts/upload.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +GCP_PROJECT_ID=$1 + +gsutil -h "Cache-Control:no-cache, max-age=0" cp -r "build/fc/*" "gs://${GCP_PROJECT_ID}-fc-versions" +if [ "$GCP_PROJECT_ID" == "e2b-prod" ]; then + # Upload kernel to GCP public builds bucket + gsutil -h "Cache-Control:no-cache, max-age=0" cp -r "build/fc/*" "gs://${GCP_PROJECT_ID}-public-builds/firecrackers/" +fi + +rm -rf build/fc/* diff --git a/src/cpu-template-helper/src/utils/mod.rs b/src/cpu-template-helper/src/utils/mod.rs index bd570840fc5..f457ca0b872 100644 --- a/src/cpu-template-helper/src/utils/mod.rs +++ b/src/cpu-template-helper/src/utils/mod.rs @@ -125,6 +125,7 @@ pub fn build_microvm_from_config( state: VmState::NotStarted, vmm_version: CPU_TEMPLATE_HELPER_VERSION.to_string(), app_name: "cpu-template-helper".to_string(), + memory_regions: None, }; let mut vm_resources = VmResources::from_json(&config, &instance_info, HTTP_MAX_PAYLOAD_SIZE, None) diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index adc0cb1ff83..9395fc508ed 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -35,7 +35,10 @@ vmm-sys-util = { 
version = "0.12.1", features = ["with-serde"] } [dev-dependencies] cargo_toml = "0.20.5" libc = "0.2.161" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.214", features = ["derive"] } @@ -48,7 +51,12 @@ serde = { version = "1.0.214" } serde_json = "1.0.132" [features] -tracing = ["log-instrument", "seccompiler/tracing", "utils/tracing", "vmm/tracing"] +tracing = [ + "log-instrument", + "seccompiler/tracing", + "utils/tracing", + "vmm/tracing", +] gdb = ["vmm/gdb"] [lints] diff --git a/src/firecracker/src/api_server/mod.rs b/src/firecracker/src/api_server/mod.rs index 6ac2955af8f..85b6358b871 100644 --- a/src/firecracker/src/api_server/mod.rs +++ b/src/firecracker/src/api_server/mod.rs @@ -274,7 +274,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); @@ -287,7 +287,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index 125463d1d05..ef629a398a4 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -20,6 +20,7 @@ use super::request::logger::parse_put_logger; use super::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use super::request::memory::{parse_get_memory, parse_get_memory_dirty, parse_get_memory_mappings}; use 
super::request::metrics::parse_put_metrics; use super::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use super::request::net::{parse_patch_net, parse_put_net}; @@ -82,6 +83,15 @@ impl TryFrom<&Request> for ParsedRequest { Ok(ParsedRequest::new_sync(VmmAction::GetFullVmConfig)) } (Method::Get, "machine-config", None) => parse_get_machine_config(), + (Method::Get, "memory", None) => match path_tokens.next() { + Some("mappings") => parse_get_memory_mappings(), + Some("dirty") => parse_get_memory_dirty(), + None => parse_get_memory(), + _ => Err(RequestError::InvalidPathMethod( + request_uri.to_string(), + Method::Get, + )), + }, (Method::Get, "mmds", None) => parse_get_mmds(), (Method::Get, _, Some(_)) => method_to_error(Method::Get), (Method::Put, "actions", Some(body)) => parse_put_actions(body), @@ -172,6 +182,9 @@ impl ParsedRequest { } VmmData::BalloonStats(stats) => Self::success_response_with_data(stats), VmmData::InstanceInformation(info) => Self::success_response_with_data(info), + VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), + VmmData::Memory(memory) => Self::success_response_with_data(memory), + VmmData::MemoryDirty(dirty) => Self::success_response_with_data(dirty), VmmData::VmmVersion(version) => Self::success_response_with_data( &serde_json::json!({ "firecracker_version": version.as_str() }), ), @@ -568,6 +581,15 @@ pub mod tests { VmmData::InstanceInformation(info) => { http_response(&serde_json::to_string(info).unwrap(), 200) } + VmmData::MemoryMappings(mappings) => { + http_response(&serde_json::to_string(mappings).unwrap(), 200) + } + VmmData::Memory(memory) => { + http_response(&serde_json::to_string(memory).unwrap(), 200) + } + VmmData::MemoryDirty(dirty) => { + http_response(&serde_json::to_string(dirty).unwrap(), 200) + } VmmData::VmmVersion(version) => http_response( &serde_json::json!({ "firecracker_version": version.as_str() }).to_string(), 200, @@ -589,6 +611,18 @@ pub mod tests { 
verify_ok_response_with(VmmData::MachineConfiguration(MachineConfig::default())); verify_ok_response_with(VmmData::MmdsValue(serde_json::from_str("{}").unwrap())); verify_ok_response_with(VmmData::InstanceInformation(InstanceInfo::default())); + verify_ok_response_with(VmmData::MemoryMappings( + vmm::vmm_config::instance_info::MemoryMappingsResponse { mappings: vec![] }, + )); + verify_ok_response_with(VmmData::Memory( + vmm::vmm_config::instance_info::MemoryResponse { + resident: vec![], + empty: vec![], + }, + )); + verify_ok_response_with(VmmData::MemoryDirty( + vmm::vmm_config::instance_info::MemoryDirty { bitmap: vec![] }, + )); verify_ok_response_with(VmmData::VmmVersion(String::default())); // Error. @@ -662,6 +696,30 @@ pub mod tests { ParsedRequest::try_from(&req).unwrap(); } + #[test] + fn test_try_from_get_memory_mappings() { + let (mut sender, receiver) = UnixStream::pair().unwrap(); + let mut connection = HttpConnection::new(receiver); + sender + .write_all(http_request("GET", "/memory/mappings", None).as_bytes()) + .unwrap(); + connection.try_read().unwrap(); + let req = connection.pop_parsed_request().unwrap(); + ParsedRequest::try_from(&req).unwrap(); + } + + #[test] + fn test_try_from_get_memory() { + let (mut sender, receiver) = UnixStream::pair().unwrap(); + let mut connection = HttpConnection::new(receiver); + sender + .write_all(http_request("GET", "/memory", None).as_bytes()) + .unwrap(); + connection.try_read().unwrap(); + let req = connection.pop_parsed_request().unwrap(); + ParsedRequest::try_from(&req).unwrap(); + } + #[test] fn test_try_from_get_version() { let (mut sender, receiver) = UnixStream::pair().unwrap(); diff --git a/src/firecracker/src/api_server/request/memory.rs b/src/firecracker/src/api_server/request/memory.rs new file mode 100644 index 00000000000..54df9f3d2a7 --- /dev/null +++ b/src/firecracker/src/api_server/request/memory.rs @@ -0,0 +1,52 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use vmm::logger::{IncMetric, METRICS}; +use vmm::rpc_interface::VmmAction; + +use super::super::parsed_request::{ParsedRequest, RequestError}; + +pub(crate) fn parse_get_memory_mappings() -> Result<ParsedRequest, RequestError> { + METRICS.get_api_requests.instance_info_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)) +} + +pub(crate) fn parse_get_memory() -> Result<ParsedRequest, RequestError> { + METRICS.get_api_requests.instance_info_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::GetMemory)) +} + +pub(crate) fn parse_get_memory_dirty() -> Result<ParsedRequest, RequestError> { + METRICS.get_api_requests.instance_info_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::GetMemoryDirty)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api_server::parsed_request::RequestAction; + + #[test] + fn test_parse_get_memory_mappings_request() { + match parse_get_memory_mappings().unwrap().into_parts() { + (RequestAction::Sync(action), _) if *action == VmmAction::GetMemoryMappings => {} + _ => panic!("Test failed."), + } + } + + #[test] + fn test_parse_get_memory_request() { + match parse_get_memory().unwrap().into_parts() { + (RequestAction::Sync(action), _) if *action == VmmAction::GetMemory => {} + _ => panic!("Test failed."), + } + } + + #[test] + fn test_parse_get_memory_dirty_request() { + match parse_get_memory_dirty().unwrap().into_parts() { + (RequestAction::Sync(action), _) if *action == VmmAction::GetMemoryDirty => {} + _ => panic!("Test failed."), + } + } +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 0c1622798f4..4442436986c 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -10,6 +10,7 @@ pub mod entropy; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/firecracker/src/api_server/request/snapshot.rs
b/src/firecracker/src/api_server/request/snapshot.rs index 8878c224b5c..448fa95ad48 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -139,7 +139,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -153,7 +153,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 8fb5392afcf..1f0ec961ae8 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -337,6 +337,7 @@ fn main_exec() -> Result<(), MainError> { state: VmState::NotStarted, vmm_version: FIRECRACKER_VERSION.to_string(), app_name: "Firecracker".to_string(), + memory_regions: None, }; if let Some(metrics_path) = arguments.single_value("metrics-path") { diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 1f2edb714b8..4200d4d0fab 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -618,6 +618,35 @@ paths: schema: $ref: "#/definitions/Error" + /memory/mappings: + get: + summary: Gets the memory mappings with skippable pages bitmap. 
+ operationId: getMemoryMappings + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryMappingsResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory: + get: + summary: Gets the memory info (resident and empty pages). + description: Returns an object with resident and empty bitmaps. The resident bitmap marks all pages that are resident. The empty bitmap marks zero pages (subset of resident pages). This is checked at the pageSize of each region. All regions must have the same page size. + operationId: getMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + /version: get: summary: Gets the Firecracker version. @@ -991,6 +1020,59 @@ definitions: description: MicroVM hypervisor build version. type: string + GuestMemoryRegionMapping: + type: object + description: Describes the region of guest memory that can be used for creating the memfile. + required: + - base_host_virt_addr + - size + - offset + - page_size + properties: + base_host_virt_addr: + type: integer + size: + description: The size of the region in bytes. + type: integer + offset: + description: The offset of the region in bytes. + type: integer + page_size: + description: The page size in bytes. + type: integer + + MemoryMappingsResponse: + type: object + description: Response containing memory region mappings. + required: + - mappings + properties: + mappings: + type: array + description: The memory region mappings. + items: + $ref: "#/definitions/GuestMemoryRegionMapping" + + MemoryResponse: + type: object + description: Response containing the memory info (resident and empty pages). + required: + - resident + - empty + properties: + resident: + type: array + description: The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. 
+ items: + type: integer + format: uint64 + empty: + type: array + description: The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). This is a subset of the resident pages. + items: + type: integer + format: uint64 + Logger: type: object description: @@ -1192,7 +1274,6 @@ definitions: SnapshotCreateParams: type: object required: - - mem_file_path - snapshot_path properties: mem_file_path: diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index c9a032edb95..afef9deac59 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -9,15 +9,18 @@ license = "Apache-2.0" bench = false [dependencies] -acpi_tables = { path = "../acpi-tables" } -aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +acpi_tables = { path = "../acpi-tables" } +aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.10.0", features = ["bindgen"] } base64 = "0.22.1" bincode = "1.2.1" bitflags = "2.6.0" crc64 = "2.0.0" -derive_more = { version = "1.0.0", default-features = false, features = ["from", "display"] } +derive_more = { version = "1.0.0", default-features = false, features = [ + "from", + "display", +] } displaydoc = "0.2.5" event-manager = "0.4.0" gdbstub = { version = "0.7.3", optional = true } @@ -39,11 +42,18 @@ serde_json = "1.0.132" slab = "0.4.7" thiserror = "1.0.67" timerfd = "1.5.0" -userfaultfd = "0.8.1" +userfaultfd = { git = "https://github.com/e2b-dev/userfaultfd-rs", branch = "feat_write_protection", features = [ + "linux5_7", + "linux5_13", + "linux6_7" +] } utils = { path = "../utils" } vhost = { version = "0.13.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.0" -vm-memory = { version = "0.16.0", features = ["backend-mmap", "backend-bitmap"] } +vm-memory = { version = "0.16.0", features = [ + "backend-mmap", + "backend-bitmap", +] } vm-superio = "0.8.0" vmm-sys-util = { version = "0.12.1", 
features = ["with-serde"] } zerocopy = { version = "0.8.8" } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index c80f004e789..96288903af3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -153,6 +153,7 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; +use serde::{Deserialize, Serialize}; /// Shorthand type for the EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -191,6 +192,20 @@ pub enum FcExitCode { ArgParsing = 153, } +/// Describes the region of guest memory that can be used for creating the memfile. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] +pub struct GuestMemoryRegionMapping { + /// Base host virtual address where the guest memory contents for this region + /// should be copied/populated. + pub base_host_virt_addr: u64, + /// Region size. + pub size: usize, + /// Offset in the backend file/buffer where the region contents are. + pub offset: u64, + /// The configured page size for this memory region. + pub page_size: usize, +} + /// Timeout used in recv_timeout, when waiting for a vcpu response on /// Pause/Resume/Save/Restore. A high enough limit that should not be reached during normal usage, /// used to detect a potential vcpu deadlock. @@ -261,6 +276,8 @@ pub enum VmmError { VmmObserverTeardown(vmm_sys_util::errno::Error), /// VMGenID error: {0} VMGenID(#[from] VmGenIdError), + /// Pagemap error: {0} + Pagemap(#[from] utils::pagemap::PagemapError), } /// Shorthand type for KVM dirty page bitmap. @@ -451,6 +468,89 @@ impl Vmm { &self.guest_memory } + /// Returns the memory mappings for the guest memory. 
+ pub fn guest_memory_mappings(&self, vm_info: &VmInfo) -> Vec<GuestMemoryRegionMapping> { + let mut offset = 0; + let mut mappings = Vec::new(); + for mem_region in self.guest_memory().iter() { + mappings.push(GuestMemoryRegionMapping { + base_host_virt_addr: mem_region.as_ptr() as u64, + size: mem_region.size(), + offset, + page_size: vm_info.huge_pages.page_size_kib(), + }); + offset += mem_region.size() as u64; + } + mappings + } + + /// Get dirty pages bitmap for guest memory. + /// + /// Returns a bitmap where each bit represents whether a guest page has been written to + /// (i.e., present in RAM and not write-protected via userfaultfd). Pages are ordered + /// following the order of memory regions as returned by `guest_memory_mappings`. + pub fn get_dirty_memory(&self, page_size: usize) -> Result<Vec<u64>, VmmError> { + let pagemap = utils::pagemap::PagemapReader::new(page_size)?; + let mut dirty_bitmap = vec![]; + + let sys_page_size = utils::get_page_size().expect("Failed to get system page size"); + + for region in self.guest_memory().iter() { + let base_addr = region.as_ptr() as usize; + let len = region.size(); + let nr_pages = len / page_size; + + // Use mincore to get resident pages at guest page size granularity + let mincore_n = len.div_ceil(sys_page_size); + let mut mincore_vec = vec![0u8; mincore_n]; + + // SAFETY: base_addr points to a valid guest memory region we own. + let mincore_result = unsafe { + libc::mincore( + base_addr as *mut libc::c_void, + len, + mincore_vec.as_mut_ptr(), + ) + }; + + // TODO: if we don't support UFFD/async WP, we can completely skip this bit, as the + // UFFD handler already tracks dirty pages through the WriteProtected events. For the + // time being, we always do. + // + // Build dirty bitmap: check pagemap only for pages that mincore reports resident.
+ let mut slot_bitmap = vec![0u64; nr_pages.div_ceil(64)]; + for page_idx in 0..nr_pages { + let page_offset = page_idx * page_size; + + let is_resident = if mincore_result == 0 { + let start = page_offset / sys_page_size; + let count = page_size.div_ceil(sys_page_size); + if start + count <= mincore_vec.len() { + mincore_vec[start..start + count] + .iter() + .any(|&v| (v & 0x1) != 0) + } else { + false + } + } else { + // If mincore failed, assume resident (conservative) + true + }; + + if is_resident { + let virt_addr = base_addr + page_offset; + if pagemap.is_page_dirty(virt_addr)? { + slot_bitmap[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + } + + dirty_bitmap.extend_from_slice(&slot_bitmap); + } + + Ok(dirty_bitmap) + } + /// Sets RDA bit in serial console pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { // When restoring from a previously saved state, there is no serial diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 16d7ed72537..358050da7e8 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -14,7 +14,7 @@ use std::sync::{Arc, Mutex}; use seccompiler::BpfThreadMap; use semver::Version; use serde::{Deserialize, Serialize}; -use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use userfaultfd::{FeatureFlags, RegisterMode, Uffd, UffdBuilder}; use vmm_sys_util::sock_ctrl_msg::ScmSocket; #[cfg(target_arch = "aarch64")] @@ -171,7 +171,10 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; - snapshot_memory_to_file(vmm, &params.mem_file_path, params.snapshot_type)?; + // Dump memory to file only if mem_file_path is specified + if let Some(ref mem_file_path) = params.mem_file_path { + snapshot_memory_to_file(vmm, mem_file_path, params.snapshot_type)?; + } Ok(()) } @@ -533,6 +536,8 @@ pub enum GuestMemoryFromUffdError { Create(userfaultfd::Error), /// Failed to register memory address range with the userfaultfd object: {0} Register(userfaultfd::Error), + /// Failed to enable
write protection on memory address range with the userfaultfd object: {0} + WriteProtect(userfaultfd::Error), /// Failed to connect to UDS Unix stream: {0} Connect(#[from] std::io::Error), /// Failed to sends file descriptor: {0} @@ -557,6 +562,10 @@ fn guest_memory_from_uffd( uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); } + uffd_builder.require_features( + FeatureFlags::MISSING_HUGETLBFS | FeatureFlags::WP_ASYNC, + ); + let uffd = uffd_builder .close_on_exec(true) .non_blocking(true) @@ -565,8 +574,22 @@ fn guest_memory_from_uffd( .map_err(GuestMemoryFromUffdError::Create)?; for mem_region in guest_memory.iter() { - uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) - .map_err(GuestMemoryFromUffdError::Register)?; + uffd.register_with_mode( + mem_region.as_ptr().cast(), + mem_region.size() as _, + RegisterMode::MISSING | RegisterMode::WRITE_PROTECT, + ) + .map_err(GuestMemoryFromUffdError::Register)?; + + // If memory is backed by huge pages, we can immediately write protect it. + // Otherwise (memory is backed by anonymous memory), write protecting here + // won't have any effect, as the write-protection bit for a page will be + // wiped when the first page fault occurs. These cases need to be handled + // directly from the UFFD handler. 
+ if huge_pages.is_hugetlbfs() { + uffd.write_protect(mem_region.as_ptr().cast(), mem_region.size() as _) + .map_err(GuestMemoryFromUffdError::WriteProtect)?; + } } send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 566228fd53a..42270c89161 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -25,7 +25,9 @@ use crate::vmm_config::balloon::{ use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, DriveError}; use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; -use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::instance_info::{ + InstanceInfo, MemoryDirty, MemoryMappingsResponse, MemoryResponse, VmState, +}; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigUpdate, VmConfigError}; use crate::vmm_config::metrics::{MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; @@ -65,6 +67,12 @@ pub enum VmmAction { GetVmMachineConfig, /// Get microVM instance information. GetVmInstanceInfo, + /// Get memory mappings with skippable pages bitmap. + GetMemoryMappings, + /// Get memory info (resident and empty pages). + GetMemory, + /// Get guest memory dirty pages information + GetMemoryDirty, /// Get microVM version. GetVmmVersion, /// Flush the metrics. This action can only be called after the logger has been configured. @@ -164,6 +172,8 @@ pub enum VmmActionError { OperationNotSupportedPostBoot, /// The requested operation is not supported before starting the microVM. OperationNotSupportedPreBoot, + /// The requested operation is not supported while the microVM is running. 
+ OperationNotSupportedWhileRunning, /// Start microvm error: {0} StartMicrovm(#[from] StartMicrovmError), /// Vsock config error: {0} @@ -189,6 +199,12 @@ pub enum VmmData { MmdsValue(serde_json::Value), /// The microVM instance information. InstanceInformation(InstanceInfo), + /// Memory mappings with skippable pages bitmap. + MemoryMappings(MemoryMappingsResponse), + /// Memory info (resident and empty pages). + Memory(MemoryResponse), + /// The guest memory dirty pages information + MemoryDirty(MemoryDirty), /// The microVM version. VmmVersion(String), } @@ -419,6 +435,9 @@ impl<'a> PrebootApiController<'a> { &self.vm_resources.vm_config, ))), GetVmInstanceInfo => Ok(VmmData::InstanceInformation(self.instance_info.clone())), + GetMemoryMappings | GetMemory | GetMemoryDirty => { + Err(VmmActionError::OperationNotSupportedPreBoot) + } GetVmmVersion => Ok(VmmData::VmmVersion(self.instance_info.vmm_version.clone())), InsertBlockDevice(config) => self.insert_block_device(config), InsertNetworkDevice(config) => self.insert_net_device(config), @@ -646,9 +665,32 @@ impl RuntimeApiController { GetVmMachineConfig => Ok(VmmData::MachineConfiguration(MachineConfig::from( &self.vm_resources.vm_config, ))), - GetVmInstanceInfo => Ok(VmmData::InstanceInformation( - self.vmm.lock().expect("Poisoned lock").instance_info(), - )), + GetVmInstanceInfo => { + let locked_vmm = self.vmm.lock().expect("Poisoned lock"); + let instance_info = locked_vmm.instance_info(); + Ok(VmmData::InstanceInformation(instance_info)) + } + GetMemoryMappings => { + let locked_vmm = self.vmm.lock().expect("Poisoned lock"); + let mappings = locked_vmm.guest_memory_mappings(&VmInfo::from(&self.vm_resources)); + + Ok(VmmData::MemoryMappings(MemoryMappingsResponse { mappings })) + } + GetMemory => { + let locked_vmm = self.vmm.lock().expect("Poisoned lock"); + let (resident_bitmap, empty_bitmap) = locked_vmm + .vm + .get_memory_info( + &locked_vmm.guest_memory(), + &VmInfo::from(&self.vm_resources), + ) + 
+ .map_err(|e| VmmActionError::InternalVmm(VmmError::Vm(e)))?; + Ok(VmmData::Memory(MemoryResponse { + resident: resident_bitmap, + empty: empty_bitmap, + })) + } + GetMemoryDirty => self.get_dirty_memory_info(), GetVmmVersion => Ok(VmmData::VmmVersion( self.vmm.lock().expect("Poisoned lock").version(), )), @@ -748,6 +790,28 @@ impl RuntimeApiController { .map_err(VmmActionError::InternalVmm) } + /// Get dirty pages information for guest memory + fn get_dirty_memory_info(&self) -> Result<VmmData, VmmActionError> { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Dirty page tracking via pagemap requires the VM to be paused so that guest + // pages are not modified while we are reading the pagemap. + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.vm_config.huge_pages.page_size_kib(); + let bitmap = vmm + .get_dirty_memory(page_size) + .map_err(VmmActionError::InternalVmm)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get dirty memory' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::MemoryDirty(MemoryDirty { bitmap })) + } + fn create_snapshot( &mut self, create_params: &CreateSnapshotParams, @@ -1150,7 +1214,7 @@ mod tests { CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), }, ))); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index a0ee2e90b6b..762aaa6bffb 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -9,6 +9,8 @@ pub mod net; pub mod signal; /// Module with state machine pub mod sm; +/// Module with pagemap utilities +pub mod pagemap; use std::num::Wrapping; use std::result::Result; diff --git a/src/vmm/src/utils/pagemap.rs b/src/vmm/src/utils/pagemap.rs new file mode 100644 index
00000000000..7cf626cb89c --- /dev/null +++ b/src/vmm/src/utils/pagemap.rs @@ -0,0 +1,111 @@ +//! Utilities for reading /proc/self/pagemap to track dirty pages. + +#![allow(clippy::cast_possible_wrap)] + +use std::fs::File; +use std::os::unix::io::AsRawFd; + +use crate::utils::get_page_size; + +const PAGEMAP_ENTRY_SIZE: usize = 8; + +/// Errors related to pagemap operations +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PagemapError { + /// Failed to open /proc/self/pagemap: {0} + OpenPagemap(#[source] std::io::Error), + /// Failed to read pagemap entry: {0} + ReadEntry(#[source] std::io::Error), +} + +/// Represents a single entry in /proc/pid/pagemap. +/// +/// Each virtual page has an 8-byte entry with the following layout: +/// - Bits 0-54: Page frame number (PFN) if present +/// - Bit 55: Page is soft-dirty (written to since last clear) +/// - Bit 56: Page is exclusively mapped +/// - Bit 57: Page is write-protected via userfaultfd +/// - Bit 58: Unused +/// - Bit 59-60: Unused +/// - Bit 61: Page is file-page or shared-anon +/// - Bit 62: Page is swapped +/// - Bit 63: Page is present in RAM +#[derive(Debug, Clone, Copy)] +pub struct PagemapEntry { + raw: u64, +} + +impl PagemapEntry { + /// Create a PagemapEntry from bytes (little-endian) + pub fn from_bytes(bytes: [u8; 8]) -> Self { + Self { + raw: u64::from_ne_bytes(bytes), + } + } + + /// Check if page is write-protected via userfaultfd + pub fn is_write_protected(&self) -> bool { + (self.raw & (1u64 << 57)) != 0 + } + + /// Check if page is present in RAM (bit 63) + pub fn is_present(&self) -> bool { + (self.raw & (1u64 << 63)) != 0 + } +} + +/// Reader for /proc/self/pagemap +#[derive(Debug)] +pub struct PagemapReader { + pagemap_fd: File, +} + +impl PagemapReader { + /// Create a new PagemapReader + pub fn new(_page_size: usize) -> Result { + let pagemap_fd = File::open("/proc/self/pagemap").map_err(PagemapError::OpenPagemap)?; + + Ok(Self { pagemap_fd }) + } + + /// Check if a 
single page is dirty (write-protected bit cleared). + /// + /// Checks the first host page (4K) of the guest page at the given address. + /// For huge pages, all host pages within the huge page typically have the same + /// dirty status, so sampling the first is sufficient. + /// + /// # Arguments + /// * `virt_addr` - Virtual address of the page to check + /// + /// # Returns + /// True if the page is present and write-protected bit is cleared (dirty). + pub fn is_page_dirty(&self, virt_addr: usize) -> Result { + // Pagemap always uses host (4K) page size + let host_page_size = get_page_size().expect("Failed to get system page size"); + + // Calculate offset for this virtual page (using host page size) + let host_vpn = virt_addr / host_page_size; + let offset = (host_vpn * PAGEMAP_ENTRY_SIZE) as i64; + + let mut entry_bytes = [0u8; 8]; + + // SAFETY: pread is safe as long as the fd is valid and the buffer is properly sized + let ret = unsafe { + libc::pread( + self.pagemap_fd.as_raw_fd(), + entry_bytes.as_mut_ptr().cast(), + PAGEMAP_ENTRY_SIZE, + offset, + ) + }; + + if ret != PAGEMAP_ENTRY_SIZE as isize { + return Err(PagemapError::ReadEntry(std::io::Error::last_os_error())); + } + + let entry = PagemapEntry::from_bytes(entry_bytes); + + // Page must be present and the write_protected bit cleared (indicating it was written to) + Ok(entry.is_present() && !entry.is_write_protected()) + } +} diff --git a/src/vmm/src/vmm_config/instance_info.rs b/src/vmm/src/vmm_config/instance_info.rs index 67fd335deaa..c77c8d90cee 100644 --- a/src/vmm/src/vmm_config/instance_info.rs +++ b/src/vmm/src/vmm_config/instance_info.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt::{self, Display, Formatter}; +use crate::GuestMemoryRegionMapping; use serde::{ser, Serialize}; /// Enumerates microVM runtime states. @@ -46,4 +47,32 @@ pub struct InstanceInfo { pub vmm_version: String, /// The name of the application that runs the microVM. 
pub app_name: String, + /// The regions of the guest memory. + #[serde(skip_serializing_if = "Option::is_none")] + pub memory_regions: Option>, +} + +/// Response structure for the memory mappings endpoint. +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +pub struct MemoryMappingsResponse { + /// The memory region mappings. + pub mappings: Vec, +} + +/// Response structure for the memory endpoint. +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +pub struct MemoryResponse { + /// The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + pub resident: Vec, + /// The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). + /// This is a subset of the resident pages. + pub empty: Vec, +} + +/// Information about dirty guest memory pages +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryDirty { + /// Bitmap for dirty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page has been written since the last snapshot. + pub bitmap: Vec, } diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index e1850b74939..6ea0ee4a92a 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -44,7 +44,9 @@ pub struct CreateSnapshotParams { /// Path to the file that will contain the microVM state. pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. - pub mem_file_path: PathBuf, + /// If not specified, the memory is not dumped to a file. + #[serde(skip_serializing_if = "Option::is_none")] + pub mem_file_path: Option, } /// Stores the configuration that will be used for loading a snapshot. 
diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 0f72abcf68f..d776b44689b 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -8,13 +8,12 @@ #[cfg(target_arch = "x86_64")] use std::fmt; -#[cfg(target_arch = "x86_64")] use kvm_bindings::{ - kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, CpuId, MsrList, - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_MAX_CPUID_ENTRIES, KVM_PIT_SPEAKER_DUMMY, + kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, kvm_userspace_memory_region, + CpuId, MsrList, KVM_API_VERSION, KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, + KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_MAX_CPUID_ENTRIES, KVM_MEM_LOG_DIRTY_PAGES, + KVM_PIT_SPEAKER_DUMMY, }; -use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION, KVM_MEM_LOG_DIRTY_PAGES}; use kvm_ioctls::{Kvm, VmFd}; use serde::{Deserialize, Serialize}; @@ -23,9 +22,16 @@ use crate::arch::aarch64::gic::GICDevice; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GicState; use crate::cpu_config::templates::KvmCapability; -#[cfg(target_arch = "x86_64")] -use crate::utils::u64_to_usize; +use crate::persist::VmInfo; +use crate::utils::{get_page_size, u64_to_usize}; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; +use crate::GuestMemoryRegionMapping; + +/// Get the host page size in bytes. +/// This should always succeed on a valid system. +fn host_page_size() -> usize { + get_page_size().expect("Failed to get system page size") +} /// Errors associated with the wrappers over KVM ioctls. 
/// Needs `rustfmt::skip` to make multiline comments work @@ -77,6 +83,8 @@ pub enum VmError { #[cfg(target_arch = "aarch64")] /// Failed to restore the VM's GIC state: {0} RestoreGic(crate::arch::aarch64::gic::GicError), + /// Invalid memory configuration: {0} + InvalidMemoryConfiguration(String), } /// Error type for [`Vm::restore_state`] @@ -257,6 +265,136 @@ impl Vm { pub fn fd(&self) -> &VmFd { &self.fd } + + /// Returns the memory mappings for the guest memory. + pub fn guest_memory_mappings( + guest_memory: &GuestMemoryMmap, + vm_info: &VmInfo, + ) -> Vec { + let mut offset = 0; + let mut mappings = Vec::new(); + for mem_region in guest_memory.iter() { + mappings.push(GuestMemoryRegionMapping { + base_host_virt_addr: mem_region.as_ptr() as u64, + size: mem_region.size(), + offset, + page_size: vm_info.huge_pages.page_size_kib(), + }); + offset += mem_region.size() as u64; + } + mappings + } + + /// Gets the memory info (resident and empty pages) for all memory regions. + /// Returns two bitmaps: resident (all resident pages) and empty (zero pages, subset of resident). + /// This checks at the pageSize of each region and requires all regions to have the same page size. 
+ pub fn get_memory_info( + &self, + guest_memory: &GuestMemoryMmap, + vm_info: &VmInfo, + ) -> Result<(Vec, Vec), VmError> { + let mappings = Self::guest_memory_mappings(guest_memory, vm_info); + + if mappings.is_empty() { + return Ok((Vec::new(), Vec::new())); + } + + // Check that all regions have the same page size + let page_size = mappings[0].page_size; + if mappings.iter().any(|m| m.page_size != page_size) { + return Err(VmError::InvalidMemoryConfiguration( + "All memory regions must have the same page size".to_string(), + )); + } + + // Calculate total number of pages across all regions + let total_pages: usize = mappings.iter().map(|m| m.size / page_size).sum(); + let bitmap_size = total_pages.div_ceil(64); + let mut resident_bitmap = vec![0u64; bitmap_size]; + let mut empty_bitmap = vec![0u64; bitmap_size]; + + let mut global_page_idx = 0; + + // SAFETY: We're reading from valid memory regions that we own + unsafe { + // Pre-allocate zero buffer once per page size (reused for all pages) + // This is the most important optimization - avoids repeated allocations + let zero_buf = vec![0u8; page_size]; + + let sys_page_size = host_page_size(); + + for mapping in &mappings { + // Find the memory region that matches this mapping + let mem_region = guest_memory + .iter() + .find(|region| region.as_ptr() as u64 == mapping.base_host_virt_addr) + .expect("Memory region not found for mapping"); + + let region_ptr = mem_region.as_ptr(); + let region_size = mem_region.size(); + let num_pages = region_size / page_size; + + // Use mincore on the entire region to check residency + let mincore_pages = region_size.div_ceil(sys_page_size); + let mut mincore_vec = vec![0u8; mincore_pages]; + + let mincore_result = libc::mincore( + region_ptr.cast::(), + region_size, + mincore_vec.as_mut_ptr(), + ); + + // Check each page + for page_idx in 0..num_pages { + let page_offset = page_idx * page_size; + let page_ptr = region_ptr.add(page_offset); + + // Check if page is resident 
using mincore + let is_resident = if mincore_result == 0 { + let page_mincore_start = page_offset / sys_page_size; + let page_mincore_count = page_size.div_ceil(sys_page_size); + if page_mincore_start + page_mincore_count <= mincore_vec.len() { + // Page is resident if any 4KB sub-page is resident (check LSB only) + mincore_vec[page_mincore_start..page_mincore_start + page_mincore_count] + .iter() + .any(|&v| (v & 0x1) != 0) + } else { + false + } + } else { + // If mincore failed, assume resident (conservative approach) + true + }; + + let bitmap_idx = global_page_idx / 64; + let bit_idx = global_page_idx % 64; + + if is_resident { + // Set bit in resident bitmap + if bitmap_idx < resident_bitmap.len() { + resident_bitmap[bitmap_idx] |= 1u64 << bit_idx; + } + + // Check if page is zero (empty) + let is_zero = libc::memcmp( + page_ptr.cast::(), + zero_buf.as_ptr().cast::(), + page_size, + ) == 0; + + // Set bit in empty bitmap if page is zero + if is_zero && bitmap_idx < empty_bitmap.len() { + empty_bitmap[bitmap_idx] |= 1u64 << bit_idx; + } + } + + global_page_idx += 1; + } + } + } + + Ok((resident_bitmap, empty_bitmap)) + } } #[cfg(target_arch = "aarch64")] @@ -296,7 +434,33 @@ impl Vm { }) } - /// Restore the KVM VM state + /// Resets the KVM dirty bitmap for each of the guest's memory regions. + pub fn reset_dirty_bitmap(&self, guest_memory: &GuestMemoryMmap) { + guest_memory.iter().zip(0u32..).for_each(|(region, slot)| { + let _ = self.fd().get_dirty_log(slot, u64_to_usize(region.len())); + }); + } + + /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. + pub fn get_dirty_bitmap( + &self, + guest_memory: &GuestMemoryMmap, + ) -> Result { + use std::collections::HashMap; + let mut bitmap: crate::DirtyBitmap = HashMap::new(); + guest_memory + .iter() + .zip(0u32..) 
+ .try_for_each(|(region, slot)| { + self.fd() + .get_dirty_log(slot, u64_to_usize(region.len())) + .map(|bitmap_region| _ = bitmap.insert(slot, bitmap_region)) + })?; + Ok(bitmap) + } + + /// Takes a snapshot of the virtual machine running inside the given [`Vmm`] and saves it to + /// `mem_file_path`. /// /// # Errors /// diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 4312c6345db..2d8a7aed580 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -212,7 +212,7 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { let snapshot_params = CreateSnapshotParams { snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), - mem_file_path: memory_file.as_path().to_path_buf(), + mem_file_path: Some(memory_file.as_path().to_path_buf()), }; controller diff --git a/tests/framework/http_api.py b/tests/framework/http_api.py index a1ee37174b0..1442a253b25 100644 --- a/tests/framework/http_api.py +++ b/tests/framework/http_api.py @@ -123,3 +123,5 @@ def __init__(self, api_usocket_full_name): self.snapshot_load = Resource(self, "/snapshot/load") self.cpu_config = Resource(self, "/cpu-config") self.entropy = Resource(self, "/entropy") + self.memory_mappings = Resource(self, "/memory/mappings") + self.memory = Resource(self, "/memory") diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 5aebe7b5265..a03e8020098 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -18,6 +18,7 @@ import host_tools.network as net_tools from framework import utils_cpuid from framework.utils import get_firecracker_version_from_toml, is_io_uring_supported +from framework.microvm import HugePagesConfig MEM_LIMIT = 1000000000 @@ -1389,3 +1390,251 @@ def test_negative_snapshot_load_api(microvm_factory): # The snapshot/memory files above don't exist, but the request is otherwise 
syntactically valid. # In this case, Firecracker exits. vm.mark_killed() + + +def test_memory_mappings_pre_boot(uvm_plain): + """Test that memory mappings endpoint is not available before boot.""" + test_microvm = uvm_plain + test_microvm.spawn() + test_microvm.basic_config() + + # Use session directly since get() asserts on 200 + url = test_microvm.api.endpoint + "/memory/mappings" + res = test_microvm.api.session.get(url) + assert res.status_code == 400 + assert NOT_SUPPORTED_BEFORE_START in res.json()["fault_message"] + + +def test_memory_pre_boot(uvm_plain): + """Test that memory endpoint is not available before boot.""" + test_microvm = uvm_plain + test_microvm.spawn() + test_microvm.basic_config() + + # Use session directly since get() asserts on 200 + url = test_microvm.api.endpoint + "/memory" + res = test_microvm.api.session.get(url) + assert res.status_code == 400 + assert NOT_SUPPORTED_BEFORE_START in res.json()["fault_message"] + + +def test_memory_mappings_post_boot(uvm_plain): + """Test that memory mappings endpoint works after boot with hugepages.""" + test_microvm = uvm_plain + test_microvm.spawn() + test_microvm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) + test_microvm.start() + + response = test_microvm.api.memory_mappings.get() + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, dict) + assert "mappings" in data + mappings = data["mappings"] + assert isinstance(mappings, list) + assert len(mappings) > 0 + + # Verify structure of each mapping + for mapping in mappings: + assert "base_host_virt_addr" in mapping + assert "size" in mapping + assert "offset" in mapping + assert "page_size" in mapping + assert isinstance(mapping["base_host_virt_addr"], int) + assert isinstance(mapping["size"], int) + assert isinstance(mapping["offset"], int) + assert isinstance(mapping["page_size"], int) + assert mapping["size"] > 0 + # Verify page size is 2MB (2097152 bytes) for hugepages + assert mapping["page_size"] 
== 2 * 1024 * 1024 + + +def test_memory_post_boot(uvm_plain): + """Test that memory endpoint works after boot with hugepages.""" + test_microvm = uvm_plain + test_microvm.spawn() + test_microvm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) + test_microvm.start() + + # Get memory mappings to determine page size and total memory + mappings_response = test_microvm.api.memory_mappings.get() + assert mappings_response.status_code == 200 + mappings_data = mappings_response.json() + assert isinstance(mappings_data, dict) + assert "mappings" in mappings_data + mappings = mappings_data["mappings"] + assert len(mappings) > 0 + + # All regions should have the same page size (2MB for hugepages) + page_size = mappings[0]["page_size"] + assert page_size == 2 * 1024 * 1024, "Expected 2MB page size for hugepages" + + # Verify all regions have the same page size + for mapping in mappings: + assert ( + mapping["page_size"] == page_size + ), "All regions must have the same page size" + + total_memory_size = sum(mapping["size"] for mapping in mappings) + total_pages = total_memory_size // page_size + expected_bitmap_size = (total_pages + 63) // 64 # ceil(total_pages / 64) + + # Get memory info + response = test_microvm.api.memory.get() + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, dict) + assert "resident" in data + assert "empty" in data + resident_bitmap = data["resident"] + empty_bitmap = data["empty"] + assert isinstance(resident_bitmap, list) + assert isinstance(empty_bitmap, list) + assert len(resident_bitmap) == expected_bitmap_size + assert len(empty_bitmap) == expected_bitmap_size + + # Verify all values are valid u64 integers + for value in resident_bitmap: + assert isinstance(value, int) + assert value >= 0 + assert value <= 0xFFFFFFFFFFFFFFFF # Max u64 value + + for value in empty_bitmap: + assert isinstance(value, int) + assert value >= 0 + assert value <= 0xFFFFFFFFFFFFFFFF # Max u64 value + + # After boot, there 
should be at least one resident page + has_resident_page = any(value != 0 for value in resident_bitmap) + assert has_resident_page, "Expected at least one resident page after VM boot" + + # Empty pages should be a subset of resident pages + # (empty_bitmap & resident_bitmap) == empty_bitmap + for i in range(len(empty_bitmap)): + assert (empty_bitmap[i] & resident_bitmap[i]) == empty_bitmap[ + i + ], "Empty pages must be a subset of resident pages" + + +@pytest.mark.nonci +def test_memory_benchmark(microvm_factory, guest_kernel_linux_6_1, rootfs): + """Benchmark the memory endpoint performance (resident + zero page checking).""" + test_microvm = microvm_factory.build(guest_kernel_linux_6_1, rootfs) + test_microvm.spawn() + + # Use larger memory size for benchmarking + # Check available hugepages and use a size that fits (need at least some headroom) + # Default to 256MB if we can't determine, or use available - 64MB headroom + try: + with open("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages", "r") as f: + free_hugepages = int(f.read().strip()) + # Each hugepage is 2MB, reserve 32 pages (64MB) for system + available_mib = max(128, (free_hugepages - 32) * 2) + mem_size_mib = min(1024, available_mib) # Cap at 1GB for proper benchmark + except (FileNotFoundError, ValueError, OSError): + # Fallback to 256MB if we can't read hugepage info + mem_size_mib = 256 + test_microvm.basic_config( + mem_size_mib=mem_size_mib, huge_pages=HugePagesConfig.HUGETLBFS_2MB + ) + # Add network interface for SSH access + test_microvm.add_net_iface() + test_microvm.start() + + # Get memory mappings to determine actual memory size + mappings_response = test_microvm.api.memory_mappings.get() + assert mappings_response.status_code == 200 + mappings_data = mappings_response.json() + mappings = mappings_data["mappings"] + + # Calculate total memory size + total_memory_bytes = sum(mapping["size"] for mapping in mappings) + total_memory_mib = total_memory_bytes / (1024 * 1024) + page_size 
= mappings[0]["page_size"] + + # Ensure memory is resident by writing zeros to it via guest + # This will fault in the pages and make them resident + # Using tmpfs (/dev/shm) ensures the memory is actually resident + # Allocate a reasonable portion (e.g., 256MB) to avoid freezing the sandbox + fault_memory_mib = min(256, int(total_memory_mib * 0.25)) # 25% or max 256MB + test_microvm.ssh.run( + "dd if=/dev/zero of=/dev/shm/zero_mem bs=1M count={} 2>/dev/null || true".format( + fault_memory_mib + ) + ) + + # Give the system a moment to fault in pages + time.sleep(0.1) + + # Benchmark the /memory endpoint call + start_time = time.perf_counter() + response = test_microvm.api.memory.get() + end_time = time.perf_counter() + + assert response.status_code == 200 + data = response.json() + assert "resident" in data + assert "empty" in data + + # Verify the response is valid + resident_bitmap = data["resident"] + empty_bitmap = data["empty"] + + # Calculate expected bitmap size + page_size = mappings[0]["page_size"] + total_pages = total_memory_bytes // page_size + expected_bitmap_size = (total_pages + 63) // 64 + + assert len(resident_bitmap) == expected_bitmap_size + assert len(empty_bitmap) == expected_bitmap_size + + # Count actual resident pages (faulted-in memory) + resident_page_count = 0 + for bitmap_value in resident_bitmap: + # Count set bits in each u64 value + resident_page_count += bin(bitmap_value).count("1") + + # Calculate resident memory size (actual memory that was checked) + resident_memory_bytes = resident_page_count * page_size + resident_memory_mib = resident_memory_bytes / (1024 * 1024) + + # Calculate elapsed time and throughput based on actual resident memory + elapsed_seconds = end_time - start_time + + if resident_memory_bytes > 0: + throughput_mib_per_sec = resident_memory_mib / elapsed_seconds + time_per_mb_ms = (elapsed_seconds * 1000) / resident_memory_mib + else: + throughput_mib_per_sec = 0 + time_per_mb_ms = 0 + + # Count empty pages + 
empty_page_count = 0 + for bitmap_value in empty_bitmap: + empty_page_count += bin(bitmap_value).count("1") + + # Print benchmark results + print(f"\n{'='*60}") + print(f"Memory Benchmark Results") + print(f"{'='*60}") + print( + f"Total Memory: {total_memory_mib:.2f} MiB ({total_memory_bytes / (1024**3):.3f} GB)" + ) + print( + f"Resident Pages: {resident_page_count} / {total_pages} ({resident_page_count * 100 / total_pages:.1f}%)" + ) + print( + f"Resident Memory: {resident_memory_mib:.2f} MiB ({resident_memory_bytes / (1024**3):.3f} GB)" + ) + print( + f"Empty Pages: {empty_page_count} / {resident_page_count} ({empty_page_count * 100 / resident_page_count if resident_page_count > 0 else 0:.1f}% of resident)" + ) + print(f"Elapsed Time: {elapsed_seconds*1000:.2f} ms") + print(f"Throughput (resident): {throughput_mib_per_sec:.2f} MiB/s") + print(f"Time per MB (resident): {time_per_mb_ms:.3f} ms/MB") + print(f"{'='*60}\n") + + # Verify at least some pages are resident + assert resident_page_count > 0, "Expected at least one resident page"