From b0b00cca4b02d0a5c7a4b14692194ebe28fca146 Mon Sep 17 00:00:00 2001 From: Woojoong Kim Date: Mon, 11 May 2026 13:59:13 -0700 Subject: [PATCH 1/2] add GPU passthrough fix patch for Kubevirt --- ...hostdev-failing-to-restart-after-hot.patch | 286 ++++++++++++++++++ SPECS/kubevirt/kubevirt.spec | 6 +- 2 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 SPECS/kubevirt/0001-Fix-VM-with-PCI-hostdev-failing-to-restart-after-hot.patch diff --git a/SPECS/kubevirt/0001-Fix-VM-with-PCI-hostdev-failing-to-restart-after-hot.patch b/SPECS/kubevirt/0001-Fix-VM-with-PCI-hostdev-failing-to-restart-after-hot.patch new file mode 100644 index 00000000000..e6906920e56 --- /dev/null +++ b/SPECS/kubevirt/0001-Fix-VM-with-PCI-hostdev-failing-to-restart-after-hot.patch @@ -0,0 +1,286 @@ +From d37d3d8335a19324f372dd24e2344563559c096e Mon Sep 17 00:00:00 2001 +From: Michael Henriksen +Date: Fri, 17 Apr 2026 23:29:54 -0400 +Subject: [PATCH] Fix VM with PCI hostdev failing to restart after hotplug + block volume + +When a hotplug block volume is mounted into the virt-launcher pod, +allowBlockMajorMinor() calls cgroupManager.Set() to add the block +device to the cgroup allowlist. On cgroups v2, this replaces the +entire eBPF device filter program. The v2Manager rebuilds the program +from its in-memory rule cache, which is initialized from +generateDeviceRulesForVMI() and does not include devices provisioned +by device plugins. This wipes access to device-plugin-provided nodes +such as /dev/vfio/* (PCI/MDEV/GPU/SR-IOV passthrough) and +/dev/bus/usb/* (USB passthrough), causing libvirt to fail with +"pci backend driver type 'default' is not supported" when starting +the domain. + +Fix by recursively scanning /dev/vfio/ and /dev/bus/usb/ inside the +container and including all discovered device rules in the initial +cache so they are preserved when the eBPF program is rebuilt. + +Fixes: https://github.com/kubevirt/kubevirt/issues/17124 + +Signed-off-by: Michael Henriksen +Co-Authored-By: Claude Opus 4.6 (1M context) +--- + pkg/virt-handler/cgroup/BUILD.bazel | 3 + + pkg/virt-handler/cgroup/cgroup_test.go | 50 +++++++++++++++++ + pkg/virt-handler/cgroup/util.go | 63 +++++++++++++++++++++ + tests/storage/hotplug.go | 77 ++++++++++++++++++++++++++ + 4 files changed, 193 insertions(+) + +diff --git a/pkg/virt-handler/cgroup/BUILD.bazel b/pkg/virt-handler/cgroup/BUILD.bazel +index ace69f1d78..4f4ec95714 100644 +--- a/pkg/virt-handler/cgroup/BUILD.bazel ++++ b/pkg/virt-handler/cgroup/BUILD.bazel +@@ -40,6 +40,9 @@ go_test( + embed = [":go_default_library"], + race = "on", + deps = [ ++ "//pkg/safepath:go_default_library", ++ "//pkg/virt-handler/isolation:go_default_library", ++ "//staging/src/kubevirt.io/api/core/v1:go_default_library", + "//staging/src/kubevirt.io/client-go/testutils:go_default_library", + "//vendor/github.com/onsi/ginkgo/v2:go_default_library", + "//vendor/github.com/onsi/gomega:go_default_library", +diff --git a/pkg/virt-handler/cgroup/cgroup_test.go b/pkg/virt-handler/cgroup/cgroup_test.go +index 50b5198e2a..53450e2a06 100644 +--- a/pkg/virt-handler/cgroup/cgroup_test.go ++++ b/pkg/virt-handler/cgroup/cgroup_test.go +@@ -20,12 +20,20 @@ + package cgroup + + import ( ++ "os" ++ "path/filepath" ++ + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + runc_cgroups "github.com/opencontainers/runc/libcontainer/cgroups" + runc_configs "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "go.uber.org/mock/gomock" ++ ++ v1 "kubevirt.io/api/core/v1" ++ ++ "kubevirt.io/kubevirt/pkg/safepath" ++ "kubevirt.io/kubevirt/pkg/virt-handler/isolation" + ) + + var _ = Describe("cgroup manager", func() { +@@ -195,3 +203,45 @@ var _ = Describe("cgroup manager", func() { + ), + ) + }) ++ ++var _ = Describe("generateDeviceRulesForVMI", func() { ++ var ( ++ ctrl *gomock.Controller ++ tempDir string ++ ) ++ ++ BeforeEach(func() { ++ ctrl = gomock.NewController(GinkgoT()) ++ tempDir = GinkgoT().TempDir() ++ Expect(os.MkdirAll(filepath.Join(tempDir, "dev"), 0755)).To(Succeed()) ++ }) ++ ++ newMockIsolationWithMountRoot := func() isolation.IsolationResult { ++ mountRoot, err := safepath.NewPathNoFollow(tempDir) ++ Expect(err).ToNot(HaveOccurred()) ++ ++ mockIso := isolation.NewMockIsolationResult(ctrl) ++ mockIso.EXPECT().MountRoot().Return(mountRoot, nil) ++ return mockIso ++ } ++ ++ It("should not fail when /dev/vfio does not exist", func() { ++ rules, err := generateDeviceRulesForVMI(&v1.VirtualMachineInstance{}, newMockIsolationWithMountRoot(), "") ++ Expect(err).ToNot(HaveOccurred()) ++ Expect(rules).To(BeEmpty()) ++ }) ++ ++ It("should not fail when /dev/vfio exists but is empty", func() { ++ Expect(os.MkdirAll(filepath.Join(tempDir, "dev", "vfio"), 0755)).To(Succeed()) ++ rules, err := generateDeviceRulesForVMI(&v1.VirtualMachineInstance{}, newMockIsolationWithMountRoot(), "") ++ Expect(err).ToNot(HaveOccurred()) ++ Expect(rules).To(BeEmpty()) ++ }) ++ ++ It("should not fail when /dev/bus/usb exists but is empty", func() { ++ Expect(os.MkdirAll(filepath.Join(tempDir, "dev", "bus", "usb"), 0755)).To(Succeed()) ++ rules, err := generateDeviceRulesForVMI(&v1.VirtualMachineInstance{}, newMockIsolationWithMountRoot(), "") ++ Expect(err).ToNot(HaveOccurred()) ++ Expect(rules).To(BeEmpty()) ++ }) ++}) +diff --git a/pkg/virt-handler/cgroup/util.go b/pkg/virt-handler/cgroup/util.go +index 892113c83d..cce3ebfcac 100644 +--- a/pkg/virt-handler/cgroup/util.go ++++ b/pkg/virt-handler/cgroup/util.go +@@ -191,9 +191,72 @@ func generateDeviceRulesForVMI(vmi *v1.VirtualMachineInstance, isolationRes isol + } + } + ++ // Device-plugin-provisioned devices (VFIO, USB) must be in the cgroup ++ // rule cache so they survive eBPF program rebuilds during hotplug. ++ for _, devDir := range []string{ ++ filepath.Join("dev", "vfio"), ++ filepath.Join("dev", "bus", "usb"), ++ } { ++ rules, err := discoverDeviceRulesInDir(mountRoot, devDir) ++ if err != nil { ++ return nil, fmt.Errorf("failed to discover device rules in %s: %v", devDir, err) ++ } ++ vmiDeviceRules = append(vmiDeviceRules, rules...) ++ } ++ + return vmiDeviceRules, nil + } + ++// discoverDeviceRulesInDir recursively scans a directory under the ++// container's filesystem and creates allow rules for all device nodes ++// found. These devices are provisioned by device plugins or the container ++// runtime and must be preserved in the v2 cgroup manager's rule cache so ++// they are not lost when the eBPF device filter is rebuilt by subsequent ++// Set() calls (e.g. during hotplug volume mounting). ++func discoverDeviceRulesInDir(mountRoot *safepath.Path, relPath string) ([]*devices.Rule, error) { ++ dirPath, err := safepath.JoinNoFollow(mountRoot, relPath) ++ if err != nil { ++ if errors.Is(err, os.ErrNotExist) { ++ return nil, nil ++ } ++ return nil, err ++ } ++ ++ var entries []os.DirEntry ++ err = dirPath.ExecuteNoFollow(func(path string) (err error) { ++ entries, err = os.ReadDir(path) ++ return err ++ }) ++ if err != nil { ++ return nil, err ++ } ++ ++ var rules []*devices.Rule ++ for _, entry := range entries { ++ if entry.IsDir() { ++ subRules, err := discoverDeviceRulesInDir(mountRoot, filepath.Join(relPath, entry.Name())) ++ if err != nil { ++ return nil, err ++ } ++ rules = append(rules, subRules...) ++ continue ++ } ++ devPath, err := safepath.JoinNoFollow(dirPath, entry.Name()) ++ if err != nil { ++ return nil, err ++ } ++ rule, err := newAllowedDeviceRule(devPath) ++ if err != nil { ++ return nil, fmt.Errorf("failed to create device rule for %s/%s: %v", relPath, entry.Name(), err) ++ } ++ if rule != nil { ++ log.Log.V(loggingVerbosity).Infof("device rule for %s/%s: %v", relPath, entry.Name(), rule) ++ rules = append(rules, rule) ++ } ++ } ++ return rules, nil ++} ++ + func newAllowedDeviceRule(devicePath *safepath.Path) (*devices.Rule, error) { + fileInfo, err := safepath.StatAtNoFollow(devicePath) + if err != nil { +diff --git a/tests/storage/hotplug.go b/tests/storage/hotplug.go +index 00e7c9607b..fe4e5595e1 100644 +--- a/tests/storage/hotplug.go ++++ b/tests/storage/hotplug.go +@@ -2265,6 +2265,83 @@ var _ = Describe(SIG("Hotplug", func() { + verifyVolumeNolongerAccessible(vmi, targets[0]) + }) + }) ++ ++ // Regression test for https://github.com/kubevirt/kubevirt/issues/17124 ++ Context("with PCI hostdev", Serial, func() { ++ const deviceName = "example.org/soundcard" ++ ++ BeforeEach(func() { ++ kvconfig.EnableFeatureGate(featuregate.HostDevicesGate) ++ ++ kv := libkubevirt.GetCurrentKv(virtClient) ++ config := kv.Spec.Configuration ++ config.PermittedHostDevices = &v1.PermittedHostDevices{ ++ PciHostDevices: []v1.PciHostDevice{ ++ { ++ PCIVendorSelector: "8086:2668", ++ ResourceName: deviceName, ++ }, ++ }, ++ } ++ kvconfig.UpdateKubeVirtConfigValueAndWait(config) ++ }) ++ ++ AfterEach(func() { ++ kv := libkubevirt.GetCurrentKv(virtClient) ++ config := kv.Spec.Configuration ++ config.PermittedHostDevices = &v1.PermittedHostDevices{} ++ kvconfig.UpdateKubeVirtConfigValueAndWait(config) ++ kvconfig.DisableFeatureGate(featuregate.HostDevicesGate) ++ }) ++ ++ It("should restart a VM after hotplugging a block volume", decorators.RequiresBlockStorage, func() { ++ sc, exists := libstorage.GetRWOBlockStorageClass() ++ if !exists { ++ Fail("Fail test when block storage class is not available") ++ } ++ ++ vmiSpec := libvmifact.NewAlpineWithTestTooling() ++ vmiSpec.Spec.Domain.Devices.HostDevices = []v1.HostDevice{ ++ {Name: "sound0", DeviceName: deviceName}, ++ } ++ vm, err := virtClient.VirtualMachine(testsuite.GetTestNamespace(nil)).Create( ++ context.Background(), ++ libvmi.NewVirtualMachine(vmiSpec, libvmi.WithRunStrategy(v1.RunStrategyAlways)), ++ metav1.CreateOptions{}, ++ ) ++ Expect(err).ToNot(HaveOccurred()) ++ Eventually(matcher.ThisVM(vm)).WithTimeout(300 * time.Second).WithPolling(time.Second).Should(matcher.BeReady()) ++ ++ vmi, err := virtClient.VirtualMachineInstance(vm.Namespace).Get(context.Background(), vm.Name, metav1.GetOptions{}) ++ Expect(err).ToNot(HaveOccurred()) ++ libwait.WaitForSuccessfulVMIStart(vmi, libwait.WithTimeout(240)) ++ ++ dvBuilder := libdv.NewDataVolume( ++ libdv.WithBlankImageSource(), ++ libdv.WithStorage( ++ libdv.StorageWithStorageClass(sc), ++ libdv.StorageWithVolumeSize(cd.BlankVolumeSize), ++ libdv.StorageWithVolumeMode(k8sv1.PersistentVolumeBlock), ++ ), ++ ) ++ dv, err := virtClient.CdiClient().CdiV1beta1().DataVolumes(testsuite.GetTestNamespace(nil)).Create( ++ context.Background(), dvBuilder, metav1.CreateOptions{}) ++ Expect(err).ToNot(HaveOccurred()) ++ libstorage.EventuallyDV(dv, 240, Or(matcher.HaveSucceeded(), matcher.WaitForFirstConsumer())) ++ ++ By("Hotplugging a block volume to the running VM") ++ addVolumeVMWithSource(vm.Name, vm.Namespace, getAddVolumeOptions("hotplug-vol", v1.DiskBusSCSI, &v1.HotplugVolumeSource{ ++ DataVolume: &v1.DataVolumeSource{Name: dv.Name}, ++ }, false, false, "")) ++ verifyVolumeStatus(vmi, v1.VolumeReady, "", "hotplug-vol") ++ ++ By("Restarting the VM") ++ vm = libvmops.StopVirtualMachine(vm) ++ err = virtClient.VirtualMachine(vm.Namespace).Start(context.Background(), vm.Name, &v1.StartOptions{}) ++ Expect(err).ToNot(HaveOccurred()) ++ Eventually(matcher.ThisVM(vm), 300*time.Second, time.Second).Should(matcher.BeReady()) ++ }) ++ }) + })) + + func verifyVolumeAndDiskVMAdded(virtClient kubecli.KubevirtClient, vm *v1.VirtualMachine, volumeNames ...string) { +-- +2.34.1 + diff --git a/SPECS/kubevirt/kubevirt.spec b/SPECS/kubevirt/kubevirt.spec index f35c97cb0e3..ade613ebdbb 100644 --- a/SPECS/kubevirt/kubevirt.spec +++ b/SPECS/kubevirt/kubevirt.spec @@ -20,7 +20,7 @@ Summary: Container native virtualization Name: kubevirt Version: 1.7.1 -Release: 7%{?dist} +Release: 8%{?dist} License: ASL 2.0 Vendor: Microsoft Corporation Distribution: Azure Linux @@ -44,6 +44,7 @@ Patch13: CVE-2026-39835.patch Patch14: CVE-2026-42502.patch Patch15: CVE-2026-7374.patch Patch16: CVE-2026-33814.patch +Patch17: 0001-Fix-VM-with-PCI-hostdev-failing-to-restart-after-hot.patch %global debug_package %{nil} BuildRequires: swtpm-tools @@ -281,6 +282,9 @@ install -p -m 0644 cmd/virt-launcher/qemu.conf %{buildroot}%{_datadir}/kube-virt %{_bindir}/virt-tests %changelog +* Wed Jun 10 2026 Woojoong Kim - 1.7.1-8 +- Add PCI passthrough patch + * Tue Jun 02 2026 Azure Linux Security Servicing Account - 1.7.1-7 - Patch for CVE-2026-33814 From e1f17dce3eca586f38b087e444b96ce8b106316b Mon Sep 17 00:00:00 2001 From: Max Weiss Date: Wed, 10 Jun 2026 21:54:58 -0700 Subject: [PATCH 2/2] Patch kubevirt to fix hotplug volume detach deadlocks Add two virt-handler/virt-controller patches addressing hotplug volume detach deadlocks observed in production on KubeVirt v1.7.1 (ICM 21000001017910 and 21000001021380): - 0002: virt-handler mountFromPod() skips mounting volumes no longer in VMI spec, so it stops resurrecting the block device of a removed volume each reconcile. Unmount() can then clean it up, IsMounted() returns false, and updateHotplugVolumeStatus() advances the phase to UnMountedFromPod, letting virt-controller delete the attachment pod. - 0003: virt-controller cleanupAttachmentPods() only keeps an old Running attachment pod as fallback if it still holds volumes worth preserving (in-spec, or in a deletion-blocking phase), avoiding cross-VMI RWO deadlocks during PVC reshuffling. Both patches include unit tests adapted to v1.7.1. Bumps Release to 9. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...lume-detach-deadlock-in-virt-handler.patch | 190 ++++++++++++++ ...ds-fallback-keeping-useless-old-pods.patch | 243 ++++++++++++++++++ SPECS/kubevirt/kubevirt.spec | 8 +- 3 files changed, 440 insertions(+), 1 deletion(-) create mode 100644 SPECS/kubevirt/0002-Fix-hotplug-volume-detach-deadlock-in-virt-handler.patch create mode 100644 SPECS/kubevirt/0003-Fix-cleanupAttachmentPods-fallback-keeping-useless-old-pods.patch diff --git a/SPECS/kubevirt/0002-Fix-hotplug-volume-detach-deadlock-in-virt-handler.patch b/SPECS/kubevirt/0002-Fix-hotplug-volume-detach-deadlock-in-virt-handler.patch new file mode 100644 index 00000000000..708b0440f53 --- /dev/null +++ b/SPECS/kubevirt/0002-Fix-hotplug-volume-detach-deadlock-in-virt-handler.patch @@ -0,0 +1,190 @@ +From bff2da131d379df5defdaa400461e5cdee6cb71c Mon Sep 17 00:00:00 2001 +From: Max Weiss +Date: Wed, 10 Jun 2026 22:23:36 -0700 +Subject: [PATCH] Fix hotplug volume detach deadlock in virt-handler + +When a hotplug volume is removed from VMI spec (via RemoveVolume API), +a deadlock can occur because mountFromPod() in the reconcile loop +continues to mount volumes that are in VMI status but no longer in spec. +This recreates the block device on every cycle, preventing Unmount() +from cleaning it up and IsMounted() from returning false, which means +the volume phase never advances past VolumeReady/MountedToPod. + +The root cause is that mountFromPod() iterates vmi.Status.VolumeStatus +without checking whether the volume is still in vmi.Spec.Volumes. This +creates a mutual dependency: virt-controller waits for the phase to +advance before deleting the attachment pod, but virt-handler keeps +re-mounting the volume, preventing the phase from advancing. + +Fix: Skip volumes in mountFromPod() that are no longer in VMI spec. +This allows Unmount() to clean up stale mounts, IsMounted() to return +false, and the existing 'mounted=false AND NOT in spec' code path in +updateHotplugVolumeStatus() to advance the phase to UnMountedFromPod. + +Also add a log message in updateHotplugVolumeStatus() for the +'mounted=true AND NOT in spec' transient state, improving observability +during the detach process. + +Signed-off-by: Max Weiss + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> +--- + pkg/virt-handler/hotplug-disk/mount.go | 17 +++++ + pkg/virt-handler/hotplug-disk/mount_test.go | 77 +++++++++++++++++++++ + pkg/virt-handler/vm.go | 5 ++ + 3 files changed, 99 insertions(+) + +diff --git a/pkg/virt-handler/hotplug-disk/mount.go b/pkg/virt-handler/hotplug-disk/mount.go +index d20dcb6461..9870cec88e 100644 +--- a/pkg/virt-handler/hotplug-disk/mount.go ++++ b/pkg/virt-handler/hotplug-disk/mount.go +@@ -332,11 +332,28 @@ func (m *volumeMounter) mountFromPod(vmi *v1.VirtualMachineInstance, sourceUID t + if err != nil { + return err + } ++ ++ // Build set of volumes currently in spec so we can skip mounting ++ // volumes that have been removed. Without this check, Mount() recreates ++ // block devices for removed volumes on every reconcile cycle, preventing ++ // Unmount() from cleaning them up and causing a deadlock where the ++ // volume phase never advances past VolumeReady/MountedToPod. ++ specVolumes := make(map[string]struct{}) ++ for i := range vmi.Spec.Volumes { ++ specVolumes[vmi.Spec.Volumes[i].Name] = struct{}{} ++ } ++ + for _, volumeStatus := range vmi.Status.VolumeStatus { + if volumeStatus.HotplugVolume == nil { + // Skip non hotplug volumes + continue + } ++ ++ if _, inSpec := specVolumes[volumeStatus.Name]; !inSpec { ++ log.Log.Object(vmi).V(3).Infof("Skipping mount for volume %s: no longer in VMI spec", volumeStatus.Name) ++ continue ++ } ++ + mountDirectory := false + if volumeStatus.MemoryDumpVolume != nil { + mountDirectory = true +diff --git a/pkg/virt-handler/hotplug-disk/mount_test.go b/pkg/virt-handler/hotplug-disk/mount_test.go +index 88bea94119..8d102f5f5b 100644 +--- a/pkg/virt-handler/hotplug-disk/mount_test.go ++++ b/pkg/virt-handler/hotplug-disk/mount_test.go +@@ -830,6 +830,33 @@ var _ = Describe("HotplugVolume", func() { + }, + }) + vmi.Status.VolumeStatus = volumeStatuses ++ ++ // Hotplug volumes must be in spec for Mount to process them ++ vmi.Spec.Volumes = []v1.Volume{ ++ {Name: "permanent"}, ++ { ++ Name: "filesystemvolume", ++ VolumeSource: v1.VolumeSource{ ++ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ ++ PersistentVolumeClaimVolumeSource: k8sv1.PersistentVolumeClaimVolumeSource{ ++ ClaimName: "filesystemvolume", ++ }, ++ Hotpluggable: true, ++ }, ++ }, ++ }, ++ { ++ Name: "blockvolume", ++ VolumeSource: v1.VolumeSource{ ++ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ ++ PersistentVolumeClaimVolumeSource: k8sv1.PersistentVolumeClaimVolumeSource{ ++ ClaimName: "blockvolume", ++ }, ++ Hotpluggable: true, ++ }, ++ }, ++ }, ++ } + deviceBasePath = func(podUID types.UID, kubeletPodDir string) (*safepath.Path, error) { + return newDir(tempDir, string(podUID), "volumeDevices") + } +@@ -898,6 +925,29 @@ var _ = Describe("HotplugVolume", func() { + Expect(err).To(HaveOccurred(), "block device volume still exists %s", blockVolume) + }) + ++ It("should skip mounting hotplug volumes no longer in VMI spec", func() { ++ vmi := api.NewMinimalVMI("fake-vmi") ++ vmi.UID = "1234" ++ vmi.Spec.Volumes = []v1.Volume{} ++ vmi.Status.VolumeStatus = []v1.VolumeStatus{ ++ { ++ Name: "removed-vol", ++ Phase: v1.VolumeReady, ++ HotplugVolume: &v1.HotplugVolumeStatus{ ++ AttachPodName: "test-pod", ++ AttachPodUID: "test-uid", ++ }, ++ }, ++ } ++ ++ err = m.mountFromPod(vmi, "", cgroupManagerMock) ++ Expect(err).ToNot(HaveOccurred()) ++ ++ record, err := m.getMountTargetRecord(vmi) ++ Expect(err).ToNot(HaveOccurred()) ++ Expect(record.MountTargetEntries).To(BeEmpty()) ++ }) ++ + It("Should not do anything if vmi has no hotplug volumes", func() { + volumeStatuses := make([]v1.VolumeStatus, 0) + volumeStatuses = append(volumeStatuses, v1.VolumeStatus{ +@@ -938,6 +988,33 @@ var _ = Describe("HotplugVolume", func() { + }, + }) + vmi.Status.VolumeStatus = volumeStatuses ++ ++ // Hotplug volumes must be in spec for Mount to process them ++ vmi.Spec.Volumes = []v1.Volume{ ++ {Name: "permanent"}, ++ { ++ Name: "filesystemvolume", ++ VolumeSource: v1.VolumeSource{ ++ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ ++ PersistentVolumeClaimVolumeSource: k8sv1.PersistentVolumeClaimVolumeSource{ ++ ClaimName: "filesystemvolume", ++ }, ++ Hotpluggable: true, ++ }, ++ }, ++ }, ++ { ++ Name: "blockvolume", ++ VolumeSource: v1.VolumeSource{ ++ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ ++ PersistentVolumeClaimVolumeSource: k8sv1.PersistentVolumeClaimVolumeSource{ ++ ClaimName: "blockvolume", ++ }, ++ Hotpluggable: true, ++ }, ++ }, ++ }, ++ } + deviceBasePath = func(podUID types.UID, kubeletPodDir string) (*safepath.Path, error) { + return newDir(tempDir, string(podUID), "volumeDevices") + } +diff --git a/pkg/virt-handler/vm.go b/pkg/virt-handler/vm.go +index eea9748258..5e2fd6a02d 100644 +--- a/pkg/virt-handler/vm.go ++++ b/pkg/virt-handler/vm.go +@@ -486,6 +486,11 @@ func (c *VirtualMachineController) updateHotplugVolumeStatus(vmi *v1.VirtualMach + volumeStatus.Phase = v1.HotplugVolumeMounted + volumeStatus.Message = fmt.Sprintf("Volume %s has been mounted in virt-launcher pod", volumeStatus.Name) + volumeStatus.Reason = VolumeMountedToPodReason ++ } else if _, ok := specVolumeMap[volumeStatus.Name]; !ok { ++ // Volume is mounted but no longer in spec. This is a transient state ++ // that should resolve once Unmount() cleans up the stale mount. ++ // Re-queue (needsRefresh is already true) to check again soon. ++ c.logger.Object(vmi).V(3).Infof("Volume %s is mounted but no longer in spec, waiting for unmount to clean up", volumeStatus.Name) + } + } else { + // Not mounted, check if the volume is in the spec, if not update status +-- +2.54.0 + diff --git a/SPECS/kubevirt/0003-Fix-cleanupAttachmentPods-fallback-keeping-useless-old-pods.patch b/SPECS/kubevirt/0003-Fix-cleanupAttachmentPods-fallback-keeping-useless-old-pods.patch new file mode 100644 index 00000000000..70c40b5edce --- /dev/null +++ b/SPECS/kubevirt/0003-Fix-cleanupAttachmentPods-fallback-keeping-useless-old-pods.patch @@ -0,0 +1,243 @@ +From 723925e7d9d10e283f20f512773ec2a32bd3d170 Mon Sep 17 00:00:00 2001 +From: Max Weiss +Date: Wed, 10 Jun 2026 22:23:36 -0700 +Subject: [PATCH] Fix cleanupAttachmentPods fallback keeping useless old pods + +The fallback logic in cleanupAttachmentPods() keeps the first old Running +attachment pod alive when the new pod is not yet Running, to preserve +volume accessibility. However, it does not check whether the old pod +actually contains any volumes worth preserving. An old pod whose hotplug +volumes are all in Detaching phase (removed from VMI spec, fully +unmounted by virt-handler) is kept alive pointlessly. + +This becomes a permanent deadlock when two or more VMIs are +simultaneously exchanging RWO PVCs: VMI A's old pod holds PVCs that +VMI B's new pod needs, and VMI B's old pod holds PVCs that VMI A's new +pod needs. Neither old pod is deleted (fallback logic), so neither new +pod can start (RWO conflict). + +Fix: Add podContainsVolumesToPreserve() check to the fallback condition. +An old pod is only kept as fallback if it contains at least one hotplug +volume that is either still in the VMI spec (VMI needs it) or in a +status phase that blocks pod deletion (Ready/MountedToPod). Old pods +with only Detaching/UnMounted volumes fall through to the normal +deletion path. + +Signed-off-by: Max Weiss + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> +--- + pkg/virt-controller/watch/vmi/vmi_test.go | 146 ++++++++++++++++++ + .../watch/vmi/volume-hotplug.go | 23 ++- + 2 files changed, 168 insertions(+), 1 deletion(-) + +diff --git a/pkg/virt-controller/watch/vmi/vmi_test.go b/pkg/virt-controller/watch/vmi/vmi_test.go +index e88b09b664..35b93fb7b2 100644 +--- a/pkg/virt-controller/watch/vmi/vmi_test.go ++++ b/pkg/virt-controller/watch/vmi/vmi_test.go +@@ -3672,6 +3672,152 @@ var _ = Describe("VirtualMachineInstance watcher", func() { + }, + }, true), + ) ++ ++ It("should delete old running pod kept as fallback when it only has Detaching volumes", func() { ++ shouldExpectPodDeletion := func(pod *k8sv1.Pod) { ++ kubeClient.Fake.PrependReactor("delete", "pods", func(action testing.Action) (handled bool, obj k8sruntime.Object, err error) { ++ update, ok := action.(testing.DeleteAction) ++ Expect(ok).To(BeTrue()) ++ Expect(pod.Namespace).To(Equal(update.GetNamespace())) ++ return true, nil, nil ++ }) ++ } ++ ++ vmi := watchtesting.NewRunningVirtualMachine("testvmi", &k8sv1.Node{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "testnode", ++ }, ++ }) ++ // Volume "detaching-vol" has been removed from spec and is in Detaching phase ++ vmi.Status.VolumeStatus = []virtv1.VolumeStatus{ ++ { ++ Name: "detaching-vol", ++ Phase: virtv1.HotplugVolumeDetaching, ++ HotplugVolume: &virtv1.HotplugVolumeStatus{ ++ AttachPodName: "old-pod", ++ AttachPodUID: "old-uid", ++ }, ++ }, ++ } ++ // No hotplug volumes in spec (all removed) ++ vmi.Spec.Volumes = []virtv1.Volume{} ++ ++ readyVolumes := []*virtv1.Volume{{}} ++ oldPod := &k8sv1.Pod{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "old-pod", ++ Namespace: vmi.Namespace, ++ }, ++ Spec: k8sv1.PodSpec{ ++ Volumes: []k8sv1.Volume{ ++ {Name: "detaching-vol"}, ++ }, ++ }, ++ Status: k8sv1.PodStatus{ ++ Phase: k8sv1.PodRunning, ++ }, ++ } ++ shouldExpectPodDeletion(oldPod) ++ ++ // currentPod is nil (new pod not running yet), but old pod has no useful volumes ++ err := controller.cleanupAttachmentPods(nil, []*k8sv1.Pod{oldPod}, vmi, len(readyVolumes)) ++ Expect(err).ToNot(HaveOccurred()) ++ testutils.ExpectEvent(recorder, kvcontroller.SuccessfulDeletePodReason) ++ }) ++ ++ It("should keep old running pod as fallback when it has in-spec volumes", func() { ++ vmi := watchtesting.NewRunningVirtualMachine("testvmi", &k8sv1.Node{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "testnode", ++ }, ++ }) ++ // Volume "active-vol" is still in spec ++ vmi.Spec.Volumes = []virtv1.Volume{ ++ { ++ Name: "active-vol", ++ VolumeSource: virtv1.VolumeSource{ ++ PersistentVolumeClaim: &virtv1.PersistentVolumeClaimVolumeSource{ ++ PersistentVolumeClaimVolumeSource: k8sv1.PersistentVolumeClaimVolumeSource{ ++ ClaimName: "active-pvc", ++ }, ++ Hotpluggable: true, ++ }, ++ }, ++ }, ++ } ++ vmi.Status.VolumeStatus = []virtv1.VolumeStatus{ ++ { ++ Name: "active-vol", ++ Phase: virtv1.VolumeReady, ++ HotplugVolume: &virtv1.HotplugVolumeStatus{ ++ AttachPodName: "old-pod", ++ AttachPodUID: "old-uid", ++ }, ++ }, ++ } ++ ++ readyVolumes := []*virtv1.Volume{{}} ++ oldPod := &k8sv1.Pod{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "old-pod", ++ Namespace: vmi.Namespace, ++ }, ++ Spec: k8sv1.PodSpec{ ++ Volumes: []k8sv1.Volume{ ++ {Name: "active-vol"}, ++ }, ++ }, ++ Status: k8sv1.PodStatus{ ++ Phase: k8sv1.PodRunning, ++ }, ++ } ++ ++ // currentPod is nil, old pod has in-spec volume → should be kept ++ err := controller.cleanupAttachmentPods(nil, []*k8sv1.Pod{oldPod}, vmi, len(readyVolumes)) ++ Expect(err).ToNot(HaveOccurred()) ++ // No deletion event expected — pod is kept as fallback ++ }) ++ ++ It("should keep old running pod as fallback when it has removed volume in deletion-blocking phase", func() { ++ vmi := watchtesting.NewRunningVirtualMachine("testvmi", &k8sv1.Node{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "testnode", ++ }, ++ }) ++ // Volume removed from spec but still in VolumeReady phase (not yet unmounted) ++ vmi.Spec.Volumes = []virtv1.Volume{} ++ vmi.Status.VolumeStatus = []virtv1.VolumeStatus{ ++ { ++ Name: "blocking-vol", ++ Phase: virtv1.VolumeReady, ++ HotplugVolume: &virtv1.HotplugVolumeStatus{ ++ AttachPodName: "old-pod", ++ AttachPodUID: "old-uid", ++ }, ++ }, ++ } ++ ++ readyVolumes := []*virtv1.Volume{{}} ++ oldPod := &k8sv1.Pod{ ++ ObjectMeta: metav1.ObjectMeta{ ++ Name: "old-pod", ++ Namespace: vmi.Namespace, ++ }, ++ Spec: k8sv1.PodSpec{ ++ Volumes: []k8sv1.Volume{ ++ {Name: "blocking-vol"}, ++ }, ++ }, ++ Status: k8sv1.PodStatus{ ++ Phase: k8sv1.PodRunning, ++ }, ++ } ++ ++ // currentPod nil, old pod has VolumeReady (deletion-blocking) volume → should be kept ++ err := controller.cleanupAttachmentPods(nil, []*k8sv1.Pod{oldPod}, vmi, len(readyVolumes)) ++ Expect(err).ToNot(HaveOccurred()) ++ // No deletion event expected — pod has volume in deletion-blocking phase ++ }) + }) + + Context("topology hints", func() { +diff --git a/pkg/virt-controller/watch/vmi/volume-hotplug.go b/pkg/virt-controller/watch/vmi/volume-hotplug.go +index 54fde22d51..0db143cac2 100644 +--- a/pkg/virt-controller/watch/vmi/volume-hotplug.go ++++ b/pkg/virt-controller/watch/vmi/volume-hotplug.go +@@ -84,8 +84,10 @@ func (c *Controller) cleanupAttachmentPods(currentPod *k8sv1.Pod, oldPods []*k8s + } + } + ++ specHotplugVolumes := make(map[string]struct{}) + for _, vmiVolume := range vmi.Spec.Volumes { + if storagetypes.IsHotplugVolume(&vmiVolume) { ++ specHotplugVolumes[vmiVolume.Name] = struct{}{} + delete(statusMap, vmiVolume.Name) + } + } +@@ -95,7 +97,8 @@ func (c *Controller) cleanupAttachmentPods(currentPod *k8sv1.Pod, oldPods []*k8s + if !foundRunning && + attachmentPod.Status.Phase == k8sv1.PodRunning && attachmentPod.DeletionTimestamp == nil && + numReadyVolumes > 0 && +- currentPodIsNotRunning { ++ currentPodIsNotRunning && ++ podContainsVolumesToPreserve(attachmentPod, statusMap, specHotplugVolumes) { + foundRunning = true + continue + } +@@ -132,6 +135,24 @@ func volumeReadyForPodDelete(phase v1.VolumePhase) bool { + return true + } + ++// podContainsVolumesToPreserve returns true if the pod contains at least one ++// hotplug volume that justifies keeping the pod alive as a fallback: ++// - A volume still in the VMI spec (the VMI needs it until the new pod is Running) ++// - A volume being detached but whose phase still blocks pod deletion ++// An old pod whose hotplug volumes are all in Detaching/UnMounted phase provides ++// no fallback value and should not block PVC cleanup on other VMIs. ++func podContainsVolumesToPreserve(pod *k8sv1.Pod, statusMap map[string]v1.VolumeStatus, specHotplugVolumes map[string]struct{}) bool { ++ for _, podVol := range pod.Spec.Volumes { ++ if _, inSpec := specHotplugVolumes[podVol.Name]; inSpec { ++ return true ++ } ++ if vs, ok := statusMap[podVol.Name]; ok && !volumeReadyForPodDelete(vs.Phase) { ++ return true ++ } ++ } ++ return false ++} ++ + func (c *Controller) handleHotplugVolumes(hotplugVolumes []*v1.Volume, hotplugAttachmentPods []*k8sv1.Pod, vmi *v1.VirtualMachineInstance, virtLauncherPod *k8sv1.Pod, dataVolumes []*cdiv1.DataVolume) common.SyncError { + logger := log.Log.Object(vmi) + +-- +2.54.0 + diff --git a/SPECS/kubevirt/kubevirt.spec b/SPECS/kubevirt/kubevirt.spec index ade613ebdbb..1a4f1fb6f49 100644 --- a/SPECS/kubevirt/kubevirt.spec +++ b/SPECS/kubevirt/kubevirt.spec @@ -20,7 +20,7 @@ Summary: Container native virtualization Name: kubevirt Version: 1.7.1 -Release: 8%{?dist} +Release: 9%{?dist} License: ASL 2.0 Vendor: Microsoft Corporation Distribution: Azure Linux @@ -45,6 +45,8 @@ Patch14: CVE-2026-42502.patch Patch15: CVE-2026-7374.patch Patch16: CVE-2026-33814.patch Patch17: 0001-Fix-VM-with-PCI-hostdev-failing-to-restart-after-hot.patch +Patch18: 0002-Fix-hotplug-volume-detach-deadlock-in-virt-handler.patch +Patch19: 0003-Fix-cleanupAttachmentPods-fallback-keeping-useless-old-pods.patch %global debug_package %{nil} BuildRequires: swtpm-tools @@ -282,6 +284,10 @@ install -p -m 0644 cmd/virt-launcher/qemu.conf %{buildroot}%{_datadir}/kube-virt %{_bindir}/virt-tests %changelog +* Wed Jun 10 2026 Max Weiss - 1.7.1-9 +- Add patch for hotplug volume detach deadlock in virt-handler +- Add patch for cleanupAttachmentPods fallback keeping useless old pods + * Wed Jun 10 2026 Woojoong Kim - 1.7.1-8 - Add PCI passthrough patch