From 8d2ee362b9bf16a52c9589917e7f4ae06e954156 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Thu, 26 Mar 2026 14:31:44 +0100 Subject: [PATCH] [CloudRift] Fix NTP clock skew breaking Docker; handle amd-smi 7.x output format CloudRift VMs boot with an incorrect RTC clock (~1h ahead). When NTP corrects it backwards, Docker discards container exit events, leaving containers stuck as ghosts forever. Add NTP sync wait before launching the shim to prevent this. Also handle both amd-smi output formats (flat array in ROCm 6.x, wrapped {"gpu_data": [...]} in ROCm 7.x) and add a 2-minute timeout to AMD GPU detection to prevent the shim from hanging indefinitely. Co-Authored-By: Claude Opus 4.6 (1M context) --- runner/internal/shim/host/gpu.go | 28 +++++++++++++++++-- .../core/backends/cloudrift/api_client.py | 19 +++++++++---- .../core/backends/cloudrift/compute.py | 16 ++++++++++- 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/runner/internal/shim/host/gpu.go b/runner/internal/shim/host/gpu.go index 0452f1ff46..10cbc7307f 100644 --- a/runner/internal/shim/host/gpu.go +++ b/runner/internal/shim/host/gpu.go @@ -10,6 +10,7 @@ import ( "path/filepath" "strconv" "strings" + "time" execute "github.com/alexellis/go-execute/v2" @@ -114,6 +115,11 @@ type amdGpu struct { Bus amdBus `json:"bus"` } +// amd-smi >= 7.x wraps the array in {"gpu_data": [...]} +type amdSmiOutput struct { + GpuData []amdGpu `json:"gpu_data"` +} + type amdAsic struct { Name string `json:"market_name"` } @@ -130,9 +136,27 @@ type amdBus struct { BDF string `json:"bdf"` // PCIe Domain:Bus:Device.Function notation } +// parseAmdSmiOutput handles both amd-smi output formats: +// ROCm 6.x returns a flat array: [{"gpu": 0, ...}, ...] +// ROCm 7.x wraps it: {"gpu_data": [{"gpu": 0, ...}, ...]} +func parseAmdSmiOutput(data []byte) ([]amdGpu, error) { + var amdGpus []amdGpu + if err := json.Unmarshal(data, &amdGpus); err == nil { + return amdGpus, nil + } + var wrapped amdSmiOutput + if err := json.Unmarshal(data, &wrapped); err != nil { + return nil, err + } + return wrapped.GpuData, nil +} + func getAmdGpuInfo(ctx context.Context) []GpuInfo { gpus := []GpuInfo{} + ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + cmd := execute.ExecTask{ Command: "docker", Args: []string{ @@ -158,8 +182,8 @@ func getAmdGpuInfo(ctx context.Context) []GpuInfo { return gpus } - var amdGpus []amdGpu - if err := json.Unmarshal([]byte(res.Stdout), &amdGpus); err != nil { + amdGpus, err := parseAmdSmiOutput([]byte(res.Stdout)) + if err != nil { log.Error(ctx, "cannot read json", "err", err) return gpus } diff --git a/src/dstack/_internal/core/backends/cloudrift/api_client.py b/src/dstack/_internal/core/backends/cloudrift/api_client.py index c8f2732f7d..d3bb425e9b 100644 --- a/src/dstack/_internal/core/backends/cloudrift/api_client.py +++ b/src/dstack/_internal/core/backends/cloudrift/api_client.py @@ -72,12 +72,16 @@ def get_vm_recipies(self) -> List[Dict]: return vm_recipes - def get_vm_image_url(self) -> Optional[str]: + def get_vm_image_url(self, gpu_vendor: Optional[str] = None) -> Optional[str]: recipes = self.get_vm_recipies() + if gpu_vendor == "amd": + driver_tag = "amd-driver" + else: + driver_tag = "nvidia-driver" + ubuntu_images = [] for recipe in recipes: - has_nvidia_driver = "nvidia-driver" in recipe.get("tags", []) - if not has_nvidia_driver: + if driver_tag not in recipe.get("tags", []): continue recipe_name = recipe.get("name", "") @@ -97,9 +101,14 @@ def get_vm_image_url(self) -> Optional[str]: return None def deploy_instance( - self, instance_type: str, region: str, ssh_keys: List[str], cmd: str + self, + instance_type: str, + region: str, + ssh_keys: List[str], + cmd: str, + gpu_vendor: Optional[str] = None, ) -> List[str]: - image_url = self.get_vm_image_url() + image_url = self.get_vm_image_url(gpu_vendor=gpu_vendor) if not image_url: raise BackendError("No suitable VM image found.") diff --git a/src/dstack/_internal/core/backends/cloudrift/compute.py b/src/dstack/_internal/core/backends/cloudrift/compute.py index d2bd4fc755..30f1a431f6 100644 --- a/src/dstack/_internal/core/backends/cloudrift/compute.py +++ b/src/dstack/_internal/core/backends/cloudrift/compute.py @@ -73,17 +73,31 @@ def create_instance( instance_config: InstanceConfiguration, placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: - commands = get_shim_commands() + # TODO: Remove once CloudRift fixes their VM RTC clock. + # Wrong RTC + NTP backward jump breaks Docker container lifecycle. + ntp_sync_commands = [ + ( + "timeout 60 bash -c '" + "while ! timedatectl show -p NTPSynchronized --value | grep -q yes;" + " do sleep 1; done' || true" + ), + ] + commands = ntp_sync_commands + get_shim_commands() startup_script = " ".join([" && ".join(commands)]) logger.debug( f"Creating instance for offer {instance_offer.instance.name} in region {instance_offer.region} with commands: {startup_script}" ) + gpu_vendor = None + if instance_offer.instance.resources.gpus: + gpu_vendor = instance_offer.instance.resources.gpus[0].vendor.value + instance_ids = self.client.deploy_instance( instance_type=instance_offer.instance.name, region=instance_offer.region, ssh_keys=instance_config.get_public_keys(), cmd=startup_script, + gpu_vendor=gpu_vendor, ) if len(instance_ids) == 0: