From 5186c829b21dcf6b73e16e6bb5751c97d53b61f5 Mon Sep 17 00:00:00 2001
From: Marius Leustean <marius.leustean@sap.com>
Date: Fri, 9 Aug 2024 16:31:47 +0300
Subject: [PATCH] Calculate memory_mb_max_unit based on instance's memory_mb

Previously the memory_mb_max_unit was relying on the
overallMemoryUsage reported by the ESXi, which was wrong, because
the host could still have a big VM with reserved memory, that was
not consuming any RAM and thus not reflected in the memory usage.

We are now looking for the instances spawned on that ESXi and
subtract their configured memory_mb from the total RAM capacity
of the host, resulting in a realistic memory_mb_max_unit.

Change-Id: I72821c97d80f9f675dca943d65f94e36824d81b6
---
 .../filters/hana_memory_max_unit_filter.py    | 10 +++
 nova/scheduler/weights/hana_binpack.py        |  5 ++
 .../test_hana_memory_max_unit_filter.py       | 28 ++++++--
 .../scheduler/weights/test_hana_binpack.py    | 31 +++++++--
 .../unit/virt/vmwareapi/test_driver_api.py    | 25 ++++++-
 nova/tests/unit/virt/vmwareapi/test_vmops.py  | 66 +++++++++++++++++++
 nova/utils.py                                 |  2 +
 nova/virt/vmwareapi/driver.py                 | 10 ++-
 nova/virt/vmwareapi/host.py                   |  6 ++
 nova/virt/vmwareapi/special_spawning.py       |  9 ++-
 nova/virt/vmwareapi/vm_util.py                |  8 ++-
 nova/virt/vmwareapi/vmops.py                  | 39 +++++++++++
 12 files changed, 216 insertions(+), 23 deletions(-)

diff --git a/nova/scheduler/filters/hana_memory_max_unit_filter.py b/nova/scheduler/filters/hana_memory_max_unit_filter.py
index a5d4e3f1285..0d51d5eeffc 100644
--- a/nova/scheduler/filters/hana_memory_max_unit_filter.py
+++ b/nova/scheduler/filters/hana_memory_max_unit_filter.py
@@ -19,6 +19,7 @@
 from nova.scheduler import filters
 from nova.scheduler import utils
 from nova.utils import BIGVM_EXCLUSIVE_TRAIT
+from nova.utils import HANA_MANUAL_SCHEDULING_TRAIT
 
 LOG = logging.getLogger(__name__)
 
@@ -40,6 +41,15 @@ def host_passes(self, host_state, spec_obj):
         if extra_specs.get(trait) != "required":
             return True
 
+        if (host_state.traits and
+                HANA_MANUAL_SCHEDULING_TRAIT in host_state.traits):
+            LOG.debug("Manual scheduling for HANA was enabled by trait "
+                      "%(trait)s for the host %(host_state)s",
+                      {"trait": HANA_MANUAL_SCHEDULING_TRAIT,
+                       "host_state": host_state})
+        else:
+            return False
+
         memory_mb_max_unit = utils.get_memory_mb_max_unit(host_state)
 
         if memory_mb_max_unit is None:
diff --git a/nova/scheduler/weights/hana_binpack.py b/nova/scheduler/weights/hana_binpack.py
index e92c5778928..df6ab735025 100644
--- a/nova/scheduler/weights/hana_binpack.py
+++ b/nova/scheduler/weights/hana_binpack.py
@@ -25,6 +25,7 @@
 from nova.scheduler import utils
 from nova.scheduler import weights
 from nova.utils import BIGVM_EXCLUSIVE_TRAIT
+from nova.utils import HANA_MANUAL_SCHEDULING_TRAIT
 
 CONF = nova.conf.CONF
 
@@ -45,6 +46,10 @@ def _weigh_object(self, host_state, weight_properties):
         if memory_mb_max_unit is None:
             return 0
 
+        if (not host_state.traits or
+                HANA_MANUAL_SCHEDULING_TRAIT not in host_state.traits):
+            return 0
+
         trait = f"trait:{BIGVM_EXCLUSIVE_TRAIT}"
         extra_specs = weight_properties.flavor.extra_specs
         if extra_specs.get(trait) == "required":
diff --git a/nova/tests/unit/scheduler/filters/test_hana_memory_max_unit_filter.py b/nova/tests/unit/scheduler/filters/test_hana_memory_max_unit_filter.py
index 548d6ecf718..d89a4059e43 100644
--- a/nova/tests/unit/scheduler/filters/test_hana_memory_max_unit_filter.py
+++ b/nova/tests/unit/scheduler/filters/test_hana_memory_max_unit_filter.py
@@ -20,6 +20,7 @@
 from nova.scheduler.filters import hana_memory_max_unit_filter
 from nova import test
 from nova.tests.unit.scheduler import fakes
+from nova.utils import HANA_MANUAL_SCHEDULING_TRAIT
 
 
 @ddt.ddt
@@ -31,7 +32,8 @@ def setUp(self):
 
     def test_passes(self):
         host = fakes.FakeHostState('host1', 'compute', {
-            'stats': {'memory_mb_max_unit': '2048'}
+            'stats': {'memory_mb_max_unit': '2048'},
+            'traits': [HANA_MANUAL_SCHEDULING_TRAIT]
         })
         extra_specs = {'trait:CUSTOM_HANA_EXCLUSIVE_HOST': 'required'}
 
@@ -46,7 +48,8 @@ def test_passes(self):
 
     def test_not_pass_too_big_flavor(self):
         host = fakes.FakeHostState('host1', 'compute', {
-            'stats': {'memory_mb_max_unit': '2048'}
+            'stats': {'memory_mb_max_unit': '2048'},
+            'traits': [HANA_MANUAL_SCHEDULING_TRAIT]
         })
         extra_specs = {'trait:CUSTOM_HANA_EXCLUSIVE_HOST': 'required'}
 
@@ -59,11 +62,27 @@ def test_not_pass_too_big_flavor(self):
 
         self.assertFalse(self.filt_cls.host_passes(host, spec_obj))
 
+    def test_not_pass_without_trait(self):
+        host = fakes.FakeHostState('host1', 'compute', {
+            'stats': {'memory_mb_max_unit': '2048'},
+        })
+        extra_specs = {'trait:CUSTOM_HANA_EXCLUSIVE_HOST': 'required'}
+
+        flavor = objects.Flavor(memory_mb=512,
+                                extra_specs=extra_specs)
+
+        spec_obj = objects.RequestSpec(
+            context=mock.sentinel.ctx,
+            flavor=flavor)
+
+        self.assertFalse(self.filt_cls.host_passes(host, spec_obj))
+
     @ddt.data({},
               {'trait:CUSTOM_HANA_EXCLUSIVE_HOST': 'forbidden'})
     def test_passes_non_hana(self, extra_specs):
         host = fakes.FakeHostState('host1', 'compute', {
-            'stats': {'memory_mb_max_unit': '2048'}
+            'stats': {'memory_mb_max_unit': '2048'},
+            'traits': [HANA_MANUAL_SCHEDULING_TRAIT]
         })
 
         flavor = objects.Flavor(memory_mb=3072,
@@ -77,7 +96,8 @@ def test_passes_non_hana(self, extra_specs):
 
     def test_passes_no_maxunit(self):
         host = fakes.FakeHostState('host1', 'compute', {
-            'stats': {}
+            'stats': {},
+            'traits': [HANA_MANUAL_SCHEDULING_TRAIT]
         })
         extra_specs = {'trait:CUSTOM_HANA_EXCLUSIVE_HOST': 'required'}
 
diff --git a/nova/tests/unit/scheduler/weights/test_hana_binpack.py b/nova/tests/unit/scheduler/weights/test_hana_binpack.py
index 268a8fcd28c..c551bef9b3f 100644
--- a/nova/tests/unit/scheduler/weights/test_hana_binpack.py
+++ b/nova/tests/unit/scheduler/weights/test_hana_binpack.py
@@ -21,6 +21,7 @@
 from nova.scheduler.weights import hana_binpack
 from nova import test
 from nova.tests.unit.scheduler import fakes
+from nova.utils import HANA_MANUAL_SCHEDULING_TRAIT
 
 
 @ddt.ddt
@@ -45,11 +46,16 @@ def _get_weighed_host(self, hosts, weight_properties=None,
                 hosts, weight_properties)[0]
 
     def _get_all_hosts(self):
+        traits = [HANA_MANUAL_SCHEDULING_TRAIT]
         host_values = [
-            ('host1', 'node1', {'stats': {'memory_mb_max_unit': '3072'}}),
-            ('host2', 'node2', {'stats': {'memory_mb_max_unit': '8192'}}),
-            ('host3', 'node3', {'stats': {'memory_mb_max_unit': '1024'}}),
-            ('host4', 'node4', {'stats': {'memory_mb_max_unit': '512'}})
+            ('host1', 'node1', {'stats': {'memory_mb_max_unit': '3072'},
+                                'traits': traits}),
+            ('host2', 'node2', {'stats': {'memory_mb_max_unit': '8192'},
+                                'traits': traits}),
+            ('host3', 'node3', {'stats': {'memory_mb_max_unit': '1024'},
+                                'traits': traits}),
+            ('host4', 'node4', {'stats': {'memory_mb_max_unit': '512'},
+                                'traits': traits})
         ]
         return [fakes.FakeHostState(host, node, values)
                 for host, node, values in host_values]
@@ -106,3 +112,20 @@ def test_no_memory_mb_max_unit(self):
                                               self._hana_spec_obj)
         self.assertEqual('host1', weighed_host.obj.host)
         self.assertEqual(0, weighed_host.weight)
+
+    def test_no_host_trait(self):
+        stats = {'memory_mb_max_unit': '3072'}
+        host_list = [
+            fakes.FakeHostState('host1', 'node1',
+                                {'stats': stats}),
+            fakes.FakeHostState('host2', 'node2',
+                                {'stats': stats,
+                                 'traits': []}),
+            fakes.FakeHostState('host3', 'node3',
+                                {'stats': stats,
+                                 'traits': ['CUSTOM_SOME_TRAIT']})
+        ]
+        weighed_host = self._get_weighed_host(host_list,
+                                              self._hana_spec_obj)
+        self.assertEqual('host1', weighed_host.obj.host)
+        self.assertEqual(0, weighed_host.weight)
diff --git a/nova/tests/unit/virt/vmwareapi/test_driver_api.py b/nova/tests/unit/virt/vmwareapi/test_driver_api.py
index eda90778dae..5aab9a2109b 100644
--- a/nova/tests/unit/virt/vmwareapi/test_driver_api.py
+++ b/nova/tests/unit/virt/vmwareapi/test_driver_api.py
@@ -2277,8 +2277,8 @@ def test_get_available_resource(self):
                 stats['supported_instances'])
         memory_mbmax_unit = stats['stats']['memory_mb_max_unit']
         vcpus_max_unit = stats['stats']['vcpus_max_unit']
-        # memory_mb_used = fake.HostSystem.overallMemoryUsage = 500
-        self.assertEqual(memory_mbmax_unit, 1024 - 500)
+        # memory_mb_used = fake.HostSystem.hardware.memorySize = 1024
+        self.assertEqual(memory_mbmax_unit, 1024)
         # vcpus_max_unit = fake.HostSystem.summary.hardware.numCpuThreads
         self.assertEqual(vcpus_max_unit, 16)
         self.assertEqual(mock.sentinel.node_uuid, stats['uuid'])
@@ -2802,3 +2802,24 @@ def test_host_state_service_disabled(self, mock_service):
             self.assertEqual(2, mock_save.call_count)
             self.assertFalse(service.disabled)
             self.assertFalse(self.conn._vc_state._auto_service_disabled)
+
+    @mock.patch.object(vmops.VMwareVMOps,
+                       'get_available_memory_per_host')
+    def test_get_cluster_stats(self, mock_get_mem_per_host):
+        mem_per_host = {
+            'host-1': 1024,
+            'host-2': 4096,
+            'host-3': 0,
+        }
+        mock_get_mem_per_host.return_value = mem_per_host
+
+        host_stats = {
+            'node-001': {'vcpus': 8},
+            'node-002': {'vcpus': 16},
+            self.conn._nodename: {'vcpus': 24}
+        }
+
+        result = self.conn._get_cluster_stats(host_stats)
+
+        self.assertEqual(16, result['vcpus_max_unit'])
+        self.assertEqual(4096, result['memory_mb_max_unit'])
diff --git a/nova/tests/unit/virt/vmwareapi/test_vmops.py b/nova/tests/unit/virt/vmwareapi/test_vmops.py
index 949f09392d2..17a7a5356cf 100644
--- a/nova/tests/unit/virt/vmwareapi/test_vmops.py
+++ b/nova/tests/unit/virt/vmwareapi/test_vmops.py
@@ -4340,3 +4340,69 @@ def test_resize_with_no_evc_mode(self, fake_drs_override,
         self._vmops._resize_vm(self._context, instance, vm_ref, flavor, None)
         fake_apply_evc_mode.assert_called_once_with(
             self._session, mock.sentinel.vm_ref, None)
+
+    @ddt.unpack
+    @ddt.data(
+        # test 1
+        ([
+             {'host': dict(memory_mb=1024),
+              'instances_mem': [512]},
+             {'host': dict(memory_mb=4096,
+                           memory_mb_reserved=2048),
+              'instances_mem': []},
+             {'host': dict(memory_mb=8192,
+                           available=False),
+              'instances_mem': []}
+         ],
+         # available memory per host
+         # expect only hosts in valid state to be returned
+         [512, 2048]),
+
+        # test 2
+        ([
+             {'host': dict(memory_mb=1024),
+              'instances_mem': [256, 256, 256]},
+             {'host': dict(memory_mb=1024),
+              'instances_mem': [512, 512, 512]},
+             {'host': dict(memory_mb=2048,
+                           memory_mb_reserved=512),
+              'instances_mem': [512, 512, 512]}
+         ],
+         # available memory per host
+         # second host is over-provisioned, but we should still
+         # return 0 available memory MB.
+         [256, 0, 0])
+
+    )
+    def test_get_available_memory_per_host(self,
+                                           hosts_with_instances,
+                                           expected_results):
+        def _fake_host_stat(available=True,
+                            memory_mb=0,
+                            memory_mb_reserved=0):
+            return {
+                "available": available,
+                "memory_mb": memory_mb,
+                "memory_mb_reserved": memory_mb_reserved
+            }
+
+        stats_per_host = {}
+        instances_ret = []
+
+        for idx, hi in enumerate(hosts_with_instances):
+            stats = _fake_host_stat(**hi['host'])
+            mo_id = 'host-%s' % idx
+            stats_per_host[mo_id] = stats
+            for mem in hi['instances_mem']:
+                instances_ret.append(
+                    (uuidutils.generate_uuid(),
+                     {'runtime.host': mo_id,
+                      'config.hardware.memoryMB': mem}))
+
+        with test.nested(
+                mock.patch.object(self._vmops, '_list_instances_in_cluster',
+                                  return_value=instances_ret),
+        ):
+            result = self._vmops.get_available_memory_per_host(
+                stats_per_host)
+            self.assertEqual(list(result.values()), expected_results)
diff --git a/nova/utils.py b/nova/utils.py
index a1352606eb7..0cde1daebd6 100644
--- a/nova/utils.py
+++ b/nova/utils.py
@@ -96,6 +96,8 @@
 EXTERNAL_CUSTOMER_SUPPORTED_TRAIT = 'CUSTOM_EXTERNAL_CUSTOMER_SUPPORTED'
 EXTERNAL_CUSTOMER_DOMAIN_TRAIT = 'CUSTOM_EXTERNAL_CUSTOMER_{}'
 
+HANA_MANUAL_SCHEDULING_TRAIT = 'CUSTOM_HANA_MANUAL_SCHEDULING'
+
 _FILE_CACHE = {}
 
 _SERVICE_TYPES = service_types.ServiceTypes()
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index ff7533c5d8d..5227bd1f6f5 100644
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -440,7 +440,10 @@ def _get_cluster_stats(self, host_stats):
         # memory_mb_max_unit represents the free memory of the freest host
         # available in the cluster. Can be used by the scheduler to determine
         # if the cluster can accommodate a big VM.
-        memory_mb_max_unit = 0
+        # It's calculated by looking at the configured memoryMB of instances
+        # in that host, not taking into account the actual memory usage.
+        memory_mb_max_unit = max(self._vmops.get_available_memory_per_host(
+            self._vc_state.per_host_stats).values())
         # vcpus_max_unit represents the vCPUs of a physical node from the
         # cluster. It can be used to determine if a VM can be powered-on in
         # this cluster, by comparing its flavor.vcpus to this value.
@@ -449,11 +452,6 @@ def _get_cluster_stats(self, host_stats):
             # Skip the cluster node
             if nodename == self._nodename:
                 continue
-            memory_mb_free = (resources['memory_mb'] -
-                              resources['memory_mb_used'])
-            if memory_mb_max_unit < memory_mb_free:
-                memory_mb_max_unit = memory_mb_free
-
             vcpus = resources['vcpus']
             if vcpus_max_unit < vcpus:
                 vcpus_max_unit = vcpus
diff --git a/nova/virt/vmwareapi/host.py b/nova/virt/vmwareapi/host.py
index 76ce144e453..bb3925e8e4d 100644
--- a/nova/virt/vmwareapi/host.py
+++ b/nova/virt/vmwareapi/host.py
@@ -68,6 +68,7 @@ def __init__(self, session, cluster_node_name, cluster, datastore_regex):
         self._cluster = cluster
         self._datastore_regex = datastore_regex
         self._stats = {}
+        self._per_host_stats = {}
         ctx = context.get_admin_context()
         try:
             service = objects.Service.get_by_compute_host(ctx, CONF.host)
@@ -154,6 +155,7 @@ def update_status(self):
                 data[h_name].setdefault('hostgroups', []).append(hg.name)
 
         self._stats = data
+        self._per_host_stats = per_host_stats
         if self._auto_service_disabled:
             self._set_host_enabled(True)
         return data
@@ -162,6 +164,10 @@ def update_status(self):
     def hostgroups(self):
         return self._stats[self._cluster_node_name].get('hostgroups', {})
 
+    @property
+    def per_host_stats(self):
+        return self._per_host_stats or {}
+
     def _merge_stats(self, host, stats, defaults):
         result = deepcopy(defaults)
         result["hypervisor_hostname"] = host
diff --git a/nova/virt/vmwareapi/special_spawning.py b/nova/virt/vmwareapi/special_spawning.py
index b1e64c5589f..f0d8f15485d 100644
--- a/nova/virt/vmwareapi/special_spawning.py
+++ b/nova/virt/vmwareapi/special_spawning.py
@@ -28,6 +28,7 @@
 from nova.virt.vmwareapi import cluster_util
 from nova.virt.vmwareapi import constants
 from nova.virt.vmwareapi import vim_util
+from nova.virt.vmwareapi import vm_util
 from nova.virt.vmwareapi.vm_util import propset_dict
 
 LOG = logging.getLogger(__name__)
@@ -255,9 +256,8 @@ def free_host(self, context):
                     host_props = propset_dict(obj.propSet)
                     runtime_summary = host_props['summary.runtime']
                     moref_value = vutil.get_moref_value(obj.obj)
-                    host_states[moref_value] = (
-                        runtime_summary.inMaintenanceMode is False and
-                        runtime_summary.connectionState == "connected")
+                    host_states[moref_value] = \
+                        vm_util.is_host_available(runtime_summary)
 
             vms_per_host = {h: vms for h, vms in vms_per_host.items()
                             if host_states[h]}
@@ -313,8 +313,7 @@ def free_host(self, context):
 
             runtime_summary = self._session._call_method(
                 vutil, "get_object_property", host_ref, 'summary.runtime')
-            if (runtime_summary.inMaintenanceMode is True or
-                    runtime_summary.connectionState != "connected"):
+            if not vm_util.is_host_available(runtime_summary):
                 LOG.warning('Host destined for spawning was set to '
                             'maintenance or became disconnected.')
                 return FREE_HOST_STATE_ERROR
diff --git a/nova/virt/vmwareapi/vm_util.py b/nova/virt/vmwareapi/vm_util.py
index d847a3d89c6..caf0dea1c25 100644
--- a/nova/virt/vmwareapi/vm_util.py
+++ b/nova/virt/vmwareapi/vm_util.py
@@ -1749,8 +1749,7 @@ def _process_host_stats(obj, host_reservations_map):
     mem_mb = getattr(hardware_summary, "memorySize", 0) // units.Mi
 
     stats = {
-        "available": (not runtime_summary.inMaintenanceMode and
-                      runtime_summary.connectionState == "connected"),
+        "available": is_host_available(runtime_summary),
         "name": host_props["name"],
         "vcpus": threads,
         "vcpus_used": 0,
@@ -2423,3 +2422,8 @@ def apply_evc_mode(session, vm_ref, evc_mode):
     task = session._call_method(session.vim, "ApplyEvcModeVM_Task", vm_ref,
                                 mask=feature_masks)
     session._wait_for_task(task)
+
+
+def is_host_available(runtime_summary):
+    return (not runtime_summary.inMaintenanceMode and
+            runtime_summary.connectionState == "connected")
diff --git a/nova/virt/vmwareapi/vmops.py b/nova/virt/vmwareapi/vmops.py
index 23335f10cea..8d5f4b9e092 100644
--- a/nova/virt/vmwareapi/vmops.py
+++ b/nova/virt/vmwareapi/vmops.py
@@ -4813,3 +4813,42 @@ def check_can_live_migrate_source(self, instance):
                 raise exception.MigrationPreCheckError(
                         reason=("Found non-configdrive CD-ROM. "
                                 "Can only migrate configdrive"))
+
+    def get_available_memory_per_host(self, per_host_stats):
+        """Retrieves per-host available RAM.
+
+        The value is calculated by subtracting the total configured
+        memory_mb of the instances residing on that host from the
+        total memory size of the host.
+
+        Returns a dict containing available RAM information per host.
+        {
+            "host-ref-value": 1024
+        }
+        """
+        mem_per_host = {}
+
+        for host_ref_value, host_stats in per_host_stats.items():
+            if not host_stats["available"]:
+                continue
+            total_mem_mb = host_stats.get('memory_mb', 0)
+            reserved_mem_mb = host_stats.get('memory_mb_reserved', 0)
+            mem_per_host[host_ref_value] = total_mem_mb - reserved_mem_mb
+
+        props = ['config.hardware.memoryMB', 'runtime.host']
+        vms = self._list_instances_in_cluster(additional_properties=props)
+
+        for (vm_uuid, vm_props) in vms:
+            host_obj = vm_props.get('runtime.host')
+            if not host_obj:
+                continue
+            host_ref_value = vutil.get_moref_value(host_obj)
+            if host_ref_value not in mem_per_host:
+                continue
+
+            vm_mb = vm_props.get('config.hardware.memoryMB', 0)
+            # make sure the minimum available memory >= 0
+            mem_per_host[host_ref_value] = \
+                max(mem_per_host[host_ref_value] - vm_mb, 0)
+
+        return mem_per_host