From dc85f47b0d762f6918af27794076e387f4a76ba4 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 16:03:06 +0200 Subject: [PATCH 01/29] lxc/cgroups: drop cgroup1 freezer support Signed-off-by: Alexander Mikhalitsyn --- src/lxc/cgroups/cgfsng.c | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index fcaea291fc..c92473a43f 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -2576,18 +2576,6 @@ __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops, return true; } -static int cg_legacy_freeze(struct cgroup_ops *ops) -{ - struct hierarchy *h; - - h = get_hierarchy(ops, "freezer"); - if (!h) - return ret_set_errno(-1, ENOENT); - - return lxc_write_openat(h->path_con, "freezer.state", - "FROZEN", STRLITERALLEN("FROZEN")); -} - static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata, struct lxc_async_descr *descr) { @@ -2680,24 +2668,12 @@ __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout) if (!ops->hierarchies) return ret_set_errno(-1, ENOENT); - if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) - return cg_legacy_freeze(ops); + if (!pure_unified_layout(ops)) + return ret_set_errno(-1, EOPNOTSUPP); return cg_unified_freeze(ops, timeout); } -static int cg_legacy_unfreeze(struct cgroup_ops *ops) -{ - struct hierarchy *h; - - h = get_hierarchy(ops, "freezer"); - if (!h) - return ret_set_errno(-1, ENOENT); - - return lxc_write_openat(h->path_con, "freezer.state", - "THAWED", STRLITERALLEN("THAWED")); -} - static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout) { return cg_unified_freeze_do(ops, timeout, "0", 0, @@ -2710,8 +2686,8 @@ __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout) if (!ops->hierarchies) return ret_set_errno(-1, ENOENT); - if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) - return cg_legacy_unfreeze(ops); + if 
(!pure_unified_layout(ops)) + return ret_set_errno(-1, EOPNOTSUPP); return cg_unified_unfreeze(ops, timeout); } From 8c8da5ed3343e63dea9f4c09f82c437236d3f10e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 16:05:14 +0200 Subject: [PATCH 02/29] lxc/cgroup: drop cgroup1 device cgroup support Signed-off-by: Alexander Mikhalitsyn --- src/lxc/cgroups/cgfsng.c | 140 --------------------------------------- src/lxc/cgroups/cgroup.h | 2 - src/lxc/start.c | 16 ----- 3 files changed, 158 deletions(-) diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index c92473a43f..557b0cd32d 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -791,16 +791,6 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, TRACE("Created limit cgroup %d->%d(%s)", h->dfd_lim, h->dfd_base, cgroup_limit_dir); - /* - * With isolation the devices legacy cgroup needs to be - * iinitialized early, as it typically contains an 'a' (all) - * line, which is not possible once a subdirectory has been - * created. - */ - if (string_in_list(h->controllers, "devices") && - !ops->setup_limits_legacy(ops, conf, true)) - return log_warn(false, "Failed to setup legacy device limits"); - /* * If we use a separate limit cgroup, the leaf cgroup, i.e. the * cgroup the container actually resides in, is below fd_limit. 
@@ -3346,135 +3336,6 @@ static int device_cgroup_rule_parse_devpath(struct device_item *device, return 0; } -static int convert_devpath(const char *invalue, char *dest) -{ - struct device_item device = {}; - int ret; - - ret = device_cgroup_rule_parse_devpath(&device, invalue); - if (ret < 0) - return -1; - - ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major, - device.minor, device.access); - if (ret < 0) - return log_error_errno(ret, -ret, - "Error on configuration value \"%c %d:%d %s\" (max 50 chars)", - device.type, device.major, device.minor, - device.access); - - return 0; -} - -/* Called from setup_limits - here we have the container's cgroup_data because - * we created the cgroups. - */ -static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, - const char *value, bool is_cpuset) -{ - __do_free char *controller = NULL; - char *p; - /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */ - char converted_value[50]; - struct hierarchy *h; - - controller = strdup(filename); - if (!controller) - return ret_errno(ENOMEM); - - p = strchr(controller, '.'); - if (p) - *p = '\0'; - - if (strequal("devices.allow", filename) && value[0] == '/') { - int ret; - - ret = convert_devpath(value, converted_value); - if (ret < 0) - return ret; - value = converted_value; - } - - h = get_hierarchy(ops, controller); - if (!h) - return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller); - - if (is_cpuset) { - int ret = lxc_write_openat(h->path_con, filename, value, strlen(value)); - if (ret) - return ret; - } - return lxc_write_openat(h->path_lim, filename, value, strlen(value)); -} - -/* - * Return the list of cgroup_settings sorted according to the following rules - * 1. 
Put memory.limit_in_bytes before memory.memsw.limit_in_bytes - */ -static void sort_cgroup_settings(struct lxc_conf *conf) -{ - LIST_HEAD(memsw_list); - struct lxc_cgroup *cgroup, *ncgroup; - - /* Iterate over the cgroup settings and copy them to the output list. */ - list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) { - if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes")) - continue; - - /* Move the memsw entry from the cgroup settings list. */ - list_move_tail(&cgroup->head, &memsw_list); - } - - /* - * Append all the memsw entries to the end of the cgroup settings list - * to make sure they are applied after all memory limit settings. - */ - list_splice_tail(&memsw_list, &conf->cgroup); - -} - -__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops, - struct lxc_conf *conf, - bool do_devices) -{ - struct list_head *cgroup_settings; - struct lxc_cgroup *cgroup; - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!conf) - return ret_set_errno(false, EINVAL); - - cgroup_settings = &conf->cgroup; - if (list_empty(cgroup_settings)) - return true; - - if (!ops->hierarchies) - return ret_set_errno(false, EINVAL); - - if (pure_unified_layout(ops)) - return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system"); - - sort_cgroup_settings(conf); - list_for_each_entry(cgroup, cgroup_settings, head) { - if (do_devices == strnequal("devices", cgroup->subsystem, 7)) { - if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) { - if (do_devices && (errno == EACCES || errno == EPERM)) { - SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); - continue; - } - SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); - return false; - } - DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value); - } - } - - INFO("Limits for the legacy cgroup hierarchies have been setup"); - return 
true; -} - /* * Some of the parsing logic comes from the original cgroup device v1 * implementation in the kernel. @@ -4185,7 +4046,6 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) cgfsng_ops->set = cgfsng_set; cgfsng_ops->freeze = cgfsng_freeze; cgfsng_ops->unfreeze = cgfsng_unfreeze; - cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy; cgfsng_ops->setup_limits = cgfsng_setup_limits; cgfsng_ops->driver = "cgfsng"; cgfsng_ops->version = "1.0.0"; diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h index 108e5d84ec..54c34530b9 100644 --- a/src/lxc/cgroups/cgroup.h +++ b/src/lxc/cgroups/cgroup.h @@ -262,8 +262,6 @@ struct cgroup_ops { size_t len, const char *name, const char *lxcpath); int (*freeze)(struct cgroup_ops *ops, int timeout); int (*unfreeze)(struct cgroup_ops *ops, int timeout); - bool (*setup_limits_legacy)(struct cgroup_ops *ops, - struct lxc_conf *conf, bool with_devices); bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_handler *handler); bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf); bool (*attach)(struct cgroup_ops *ops, const struct lxc_conf *conf, diff --git a/src/lxc/start.c b/src/lxc/start.c index c64e78e93f..7f269805e0 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -2020,11 +2020,6 @@ static int lxc_spawn(struct lxc_handler *handler) } } - if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) { - ERROR("Failed to setup cgroup limits for container \"%s\"", name); - goto out_delete_net; - } - if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) { ERROR("Failed to delegate controllers to payload cgroup"); goto out_delete_net; @@ -2113,17 +2108,6 @@ static int lxc_spawn(struct lxc_handler *handler) if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS)) goto out_delete_net; - /* - * With isolation the limiting devices cgroup was already setup, so - * only setup devices here if we have no namespace directory. 
- */ - if (!handler->conf->cgroup_meta.namespace_dir && - !cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) { - ERROR("Failed to setup legacy device cgroup controller limits"); - goto out_delete_net; - } - TRACE("Set up legacy device cgroup controller limits"); - if (!cgroup_ops->devices_activate(cgroup_ops, handler)) { ERROR("Failed to setup cgroup2 device controller limits"); goto out_delete_net; From b4e57865561884baa6af2224c2d0e7a3cad9a3ed Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 16:22:49 +0200 Subject: [PATCH 03/29] lxc/cgroups: drop special handling logic for cgroup1 cpuset controller Signed-off-by: Alexander Mikhalitsyn --- src/lxc/cgroups/cgfsng.c | 278 +-------------------------------------- 1 file changed, 6 insertions(+), 272 deletions(-) diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 557b0cd32d..cf27b86c57 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -184,138 +184,6 @@ int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool l return 0; } -/* Create cpumask from cpulist aka turn: - * - * 0,2-3 - * - * into bit array - * - * 1 0 1 1 - */ -static int lxc_cpumask(char *buf, __u32 **bitarr, __u32 *last_set_bit) -{ - __do_free __u32 *arr_u32 = NULL; - __u32 cur_last_set_bit = 0, nbits = 256; - __u32 nr_u32; - char *token; - - nr_u32 = BITS_TO_LONGS(nbits); - arr_u32 = zalloc(nr_u32 * sizeof(__u32)); - if (!arr_u32) - return ret_errno(ENOMEM); - - lxc_iterate_parts(token, buf, ",") { - __u32 last_bit, first_bit; - char *range; - - errno = 0; - first_bit = strtoul(token, NULL, 0); - last_bit = first_bit; - range = strchr(token, '-'); - if (range) - last_bit = strtoul(range + 1, NULL, 0); - - if (!(first_bit <= last_bit)) - return ret_errno(EINVAL); - - if (last_bit >= nbits) { - __u32 add_bits = last_bit - nbits + 32; - __u32 new_nr_u32; - __u32 *p; - - new_nr_u32 = BITS_TO_LONGS(nbits + add_bits); - p = realloc(arr_u32, new_nr_u32 * 
sizeof(uint32_t)); - if (!p) - return ret_errno(ENOMEM); - arr_u32 = move_ptr(p); - - memset(arr_u32 + nr_u32, 0, - (new_nr_u32 - nr_u32) * sizeof(uint32_t)); - nbits += add_bits; - } - - while (first_bit <= last_bit) - set_bit(first_bit++, arr_u32); - - if (last_bit > cur_last_set_bit) - cur_last_set_bit = last_bit; - } - - *last_set_bit = cur_last_set_bit; - *bitarr = move_ptr(arr_u32); - return 0; -} - -static int lxc_cpumask_update(char *buf, __u32 *bitarr, __u32 last_set_bit, - bool clear) -{ - bool flipped = false; - char *token; - - lxc_iterate_parts(token, buf, ",") { - __u32 last_bit, first_bit; - char *range; - - errno = 0; - first_bit = strtoul(token, NULL, 0); - last_bit = first_bit; - range = strchr(token, '-'); - if (range) - last_bit = strtoul(range + 1, NULL, 0); - - if (!(first_bit <= last_bit)) { - WARN("The cup range seems to be inverted: %u-%u", first_bit, last_bit); - continue; - } - - if (last_bit > last_set_bit) - continue; - - while (first_bit <= last_bit) { - if (clear && is_set(first_bit, bitarr)) { - flipped = true; - clear_bit(first_bit, bitarr); - } else if (!clear && !is_set(first_bit, bitarr)) { - flipped = true; - set_bit(first_bit, bitarr); - } - - first_bit++; - } - } - - if (flipped) - return 1; - - return 0; -} - -/* Turn cpumask into simple, comma-separated cpulist. 
*/ -static char *lxc_cpumask_to_cpulist(__u32 *bitarr, __u32 last_set_bit) -{ - __do_free_string_list char **cpulist = NULL; - char numstr[INTTYPE_TO_STRLEN(__u32)] = {0}; - int ret; - - for (__u32 bit = 0; bit <= last_set_bit; bit++) { - if (!is_set(bit, bitarr)) - continue; - - ret = strnprintf(numstr, sizeof(numstr), "%u", bit); - if (ret < 0) - return NULL; - - ret = lxc_append_string(&cpulist, numstr); - if (ret < 0) - return ret_set_errno(NULL, ENOMEM); - } - - if (!cpulist) - return ret_set_errno(NULL, ENOMEM); - - return lxc_string_join(",", (const char **)cpulist, false); -} - static inline bool is_unified_hierarchy(const struct hierarchy *h) { return h->fs_type == UNIFIED_HIERARCHY; @@ -580,131 +448,8 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, SYSWARN("Failed to destroy cgroups"); } -#define __ISOL_CPUS "/sys/devices/system/cpu/isolated" -#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline" -static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child, - bool am_initialized) -{ - __do_free char *cpulist = NULL, *isolcpus = NULL, - *offlinecpus = NULL, *posscpus = NULL; - __do_free __u32 *possmask = NULL; - int ret; - __u32 poss_last_set_bit = 0; - -#if !IS_BIONIC - posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0); -#else - posscpus = read_file_at(dfd_parent, "cpus", PROTECT_OPEN, 0); -#endif - if (!posscpus) - return log_error_errno(false, errno, "Failed to read file %d/cpuset.cpus", dfd_parent); - - if (file_exists(__ISOL_CPUS)) { - isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0); - if (!isolcpus) - return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS); - - if (!isdigit(isolcpus[0])) - free_disarm(isolcpus); - } else { - TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist"); - } - - if (file_exists(__OFFLINE_CPUS)) { - offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0); - if (!offlinecpus) - return 
log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS); - - if (!isdigit(offlinecpus[0])) - free_disarm(offlinecpus); - } else { - TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist"); - } - - if (!isolcpus && !offlinecpus) { - cpulist = move_ptr(posscpus); - goto copy_parent; - } - - ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit); - if (ret) - return log_error_errno(false, errno, "Failed to create cpumask for possible cpus"); - - if (isolcpus) - ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true); - - if (offlinecpus) - ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true); - - if (!ret) { - cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit); - TRACE("No isolated or offline cpus present in cpuset"); - } else { - cpulist = move_ptr(posscpus); - TRACE("Removed isolated or offline cpus from cpuset"); - } - if (!cpulist) - return log_error_errno(false, errno, "Failed to create cpu list"); - -copy_parent: - if (!am_initialized) { -#if !IS_BIONIC - ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist)); -#else - ret = lxc_writeat(dfd_child, "cpus", cpulist, strlen(cpulist)); -#endif - if (ret < 0) - return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child); - - TRACE("Copied cpu settings of parent cgroup"); - } - - return true; -} - -static bool cpuset1_initialize(int dfd_base, int dfd_next) -{ - char mems[PATH_MAX]; - ssize_t bytes; - char v; - - /* Determine whether the base cgroup has cpuset inheritance turned on. */ - bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1); - if (bytes < 0) - return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base); - - /* Initialize cpuset.cpus removing any isolated and offline cpus. 
*/ - if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1')) - return syserror_ret(false, "Failed to initialize cpuset.cpus"); - - /* Read cpuset.mems from parent... */ -#if !IS_BIONIC - bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems)); -#else - bytes = lxc_readat(dfd_base, "mems", mems, sizeof(mems)); -#endif - if (bytes < 0) - return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base); - - /* and copy to first cgroup in the tree... */ -#if !IS_BIONIC - bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes); -#else - bytes = lxc_writeat(dfd_next, "mems", mems, bytes); -#endif - if (bytes < 0) - return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next); - - /* and finally turn on cpuset inheritance. */ - bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1); - if (bytes < 0) - return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next); - - return log_trace(true, "Initialized cpuset in the legacy hierarchy"); -} - static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode, - bool cpuset_v1, bool eexist_ignore) + bool eexist_ignore) { __do_close int dfd_final = -EBADF; int dfd_cur = dfd_base; @@ -747,8 +492,7 @@ static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode, !ret ? " newly created" : "", dfd_base, cur); if (dfd_cur != dfd_base) close(dfd_cur); - else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final)) - return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy"); + /* * Leave dfd_final pointing to the last fd we opened so * it will be automatically zapped if we return early. 
@@ -771,17 +515,10 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, const char *cgroup_leaf, bool payload) { __do_close int fd_limit = -EBADF, fd_final = -EBADF; - bool cpuset_v1 = false; - - /* - * The legacy cpuset controller needs massaging in case inheriting - * settings from its immediate ancestor cgroup hasn't been turned on. - */ - cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset"); if (payload && cgroup_leaf) { /* With isolation both parts need to not already exist. */ - fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false); + fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, false); if (fd_limit < 0) return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir); @@ -795,7 +532,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, * If we use a separate limit cgroup, the leaf cgroup, i.e. the * cgroup the container actually resides in, is below fd_limit. */ - fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false); + fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, false); if (fd_final < 0) { /* Ensure we don't leave any garbage behind. */ if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir)) @@ -808,7 +545,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL); } else { - fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false); + fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, false); if (fd_final < 0) return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? 
"payload" : "monitor", h->dfd_base, cgroup_limit_dir); @@ -895,7 +632,6 @@ __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops, __do_close int fd_pivot = -EBADF; __do_free char *pivot_path = NULL; struct hierarchy *h = ops->hierarchies[i]; - bool cpuset_v1 = false; int ret; /* Monitor might have died before we entered the cgroup. */ @@ -911,9 +647,7 @@ __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops, else pivot_path = must_make_path(CGROUP_PIVOT, NULL); - cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset"); - - fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true); + fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, true); if (fd_pivot < 0) { SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path); continue; From 14620221cf9a421023b60e11d05ea902bc6ced2a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 16:42:14 +0200 Subject: [PATCH 04/29] lxc/cgroups: drop cgroup1 mounting logic Signed-off-by: Alexander Mikhalitsyn --- src/lxc/cgroups/cgfsng.c | 186 +-------------------------------------- 1 file changed, 4 insertions(+), 182 deletions(-) diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index cf27b86c57..6e37a6a068 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -1813,75 +1813,6 @@ __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops) } } -/* cgroup-full:* is done, no need to create subdirs */ -static inline bool cg_mount_needs_subdirs(int cgroup_automount_type) -{ - switch (cgroup_automount_type) { - case LXC_AUTO_CGROUP_RO: - return true; - case LXC_AUTO_CGROUP_RW: - return true; - case LXC_AUTO_CGROUP_MIXED: - return true; - } - - return false; -} - -/* After $rootfs/sys/fs/container/controller/the/cg/path has been created, - * remount controller ro if needed and bindmount the cgroupfs onto - * control/the/cg/path. 
- */ -static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h, - char *hierarchy_mnt, char *cgpath, - const char *container_cgroup) -{ - __do_free char *sourcepath = NULL; - int ret, remount_flags; - int flags = MS_BIND; - - if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || - (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) { - ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"", - hierarchy_mnt, hierarchy_mnt); - - remount_flags = add_required_remount_flags(hierarchy_mnt, - hierarchy_mnt, - flags | MS_REMOUNT); - ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", - remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY, - NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt); - - INFO("Remounted %s read-only", hierarchy_mnt); - } - - sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL); - if (cgroup_automount_type == LXC_AUTO_CGROUP_RO) - flags |= MS_RDONLY; - - ret = mount(sourcepath, cgpath, "cgroup", flags, NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"", - h->controllers[0], cgpath); - INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath); - - if (flags & MS_RDONLY) { - remount_flags = add_required_remount_flags(sourcepath, cgpath, - flags | MS_REMOUNT); - ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath); - INFO("Remounted %s read-only", cgpath); - } - - INFO("Completed second stage cgroup automounts for \"%s\"", cgpath); - return 0; -} - /* __cgroupfs_mount * * Mount cgroup hierarchies directly without using bind-mounts. 
The main @@ -1894,7 +1825,7 @@ static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, { __do_close int fd_fs = -EBADF; unsigned int flags = 0; - char *fstype; + char *fstype = "cgroup2"; int ret; if (dfd_mnt_cgroupfs < 0) @@ -1910,10 +1841,8 @@ static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, (cgroup_automount_type == LXC_AUTO_CGROUP2_RO)) flags |= MOUNT_ATTR_RDONLY; - if (is_unified_hierarchy(h)) - fstype = "cgroup2"; - else - fstype = "cgroup"; + if (!is_unified_hierarchy(h)) + return ret_errno(EOPNOTSUPP); if (can_use_mount_api()) { fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); @@ -1970,26 +1899,6 @@ static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, dfd_mnt_cgroupfs, hierarchy_mnt); } -static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h, - struct lxc_rootfs *rootfs, - int dfd_mnt_cgroupfs, - const char *hierarchy_mnt) -{ - switch (cgroup_automount_type) { - case LXC_AUTO_CGROUP_FULL_RO: - break; - case LXC_AUTO_CGROUP_FULL_RW: - break; - case LXC_AUTO_CGROUP_FULL_MIXED: - break; - default: - return 0; - } - - return __cgroupfs_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_cgroupfs, hierarchy_mnt); -} - __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, struct lxc_handler *handler, int cg_flags) { @@ -1999,7 +1908,6 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, bool in_cgroup_ns = false, wants_force_mount = false; struct lxc_conf *conf = handler->conf; struct lxc_rootfs *rootfs = &conf->rootfs; - const char *rootfs_mnt = get_rootfs_mnt(rootfs); int ret; if (!ops) @@ -2143,93 +2051,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, return syserror_ret(false, "Failed to mount cgroups"); } - /* - * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're - * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the - * DEFAULT_CGROUP_MOUNTPOINT define. 
- */ - if (can_use_mount_api()) { - fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); - if (fd_fs < 0) - return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs"); - - ret = fs_set_property(fd_fs, "mode", "0755"); - if (ret < 0) - return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - - ret = fs_set_property(fd_fs, "size", "10240k"); - if (ret < 0) - return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - - ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, - MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | - MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME); - } else { - cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); - ret = safe_mount(NULL, cgroup_root, "tmpfs", - MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, - "size=10240k,mode=755", rootfs_mnt); - } - if (ret < 0) - return log_error_errno(false, errno, "Failed to mount tmpfs on %s", - DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); - - dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (dfd_mnt_tmpfs < 0) - return syserror_ret(false, "Failed to open %d(%s)", - rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); - - for (int i = 0; ops->hierarchies[i]; i++) { - __do_free char *hierarchy_mnt = NULL, *path2 = NULL; - struct hierarchy *h = ops->hierarchies[i]; - - ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000); - if (ret < 0) - return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); - - if (in_cgroup_ns && wants_force_mount) { - /* - * If cgroup namespaces are supported but the container - * will not have CAP_SYS_ADMIN after it has started we - * need to mount the cgroups manually. 
- */ - ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_tmpfs, h->at_mnt); - if (ret < 0) - return false; - - continue; - } - - /* Here is where the ancient kernel section begins. */ - ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_tmpfs, h->at_mnt); - if (ret < 0) - return false; - - if (!cg_mount_needs_subdirs(cgroup_automount_type)) - continue; - - if (!cgroup_root) - cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); - - hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL); - path2 = must_make_path(hierarchy_mnt, h->at_base, - ops->container_cgroup, NULL); - ret = lxc_mkdir_p(path2, 0755); - if (ret < 0 && (errno != EEXIST)) - return false; - - ret = cg_legacy_mount_controllers(cgroup_automount_type, h, - hierarchy_mnt, path2, - ops->container_cgroup); - if (ret < 0) - return false; - } - - return true; + return syserror_ret(false, "Failed to mount cgroups - unsupported cgroup layout"); } /* Only root needs to escape to the cgroup of its init. */ From 358a66022f4e15e544f30ac7303fdeb5225cd880 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 17:09:07 +0200 Subject: [PATCH 05/29] lxc/conf: drop cgroup1 config options (lxc.cgroup.*) Signed-off-by: Alexander Mikhalitsyn --- src/lxc/conf.c | 7 ------- src/lxc/conf.h | 1 - src/lxc/confile.c | 35 +---------------------------------- 3 files changed, 1 insertion(+), 42 deletions(-) diff --git a/src/lxc/conf.c b/src/lxc/conf.c index 571b3fd203..947cbad66f 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -3208,7 +3208,6 @@ struct lxc_conf *lxc_conf_init(void) new->rootfs.fd_path_pin = -EBADF; new->rootfs.dfd_idmapped = -EBADF; new->logfd = -1; - INIT_LIST_HEAD(&new->cgroup); INIT_LIST_HEAD(&new->cgroup2); /* Block ("allowlist") all devices by default. 
*/ new->bpf_devices.list_type = LXC_BPF_DEVICE_CGROUP_ALLOWLIST; @@ -4118,11 +4117,6 @@ int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version) namespaced_token = "lxc.cgroup2."; namespaced_token_len = STRLITERALLEN("lxc.cgroup2."); list = &c->cgroup2; - } else if (version == CGROUP_SUPER_MAGIC) { - global_token = "lxc.cgroup"; - namespaced_token = "lxc.cgroup."; - namespaced_token_len = STRLITERALLEN("lxc.cgroup."); - list = &c->cgroup; } else { return ret_errno(EINVAL); } @@ -4370,7 +4364,6 @@ void lxc_conf_free(struct lxc_conf *conf) free(conf->lsm_se_keyring_context); lxc_seccomp_free(&conf->seccomp); lxc_clear_config_caps(conf); - lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC); lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC); lxc_clear_cgroups_devices(conf); lxc_clear_hooks(conf, "lxc.hook"); diff --git a/src/lxc/conf.h b/src/lxc/conf.h index 762d58901d..ea4e199404 100644 --- a/src/lxc/conf.h +++ b/src/lxc/conf.h @@ -417,7 +417,6 @@ struct lxc_conf { struct utsname *utsname; struct { - struct list_head cgroup; struct list_head cgroup2; struct bpf_devices bpf_devices; }; diff --git a/src/lxc/confile.c b/src/lxc/confile.c index 8985f15b79..3d310a9b70 100644 --- a/src/lxc/confile.c +++ b/src/lxc/confile.c @@ -65,7 +65,6 @@ lxc_config_define(apparmor_profile); lxc_config_define(apparmor_raw); lxc_config_define(cap_drop); lxc_config_define(cap_keep); -lxc_config_define(cgroup_controller); lxc_config_define(cgroup2_controller); lxc_config_define(cgroup_dir); lxc_config_define(cgroup_monitor_dir); @@ -206,7 +205,6 @@ static struct lxc_config_t config_jump_table[] = { { "lxc.cgroup.dir.container", true, set_config_cgroup_container_dir, get_config_cgroup_container_dir, clr_config_cgroup_container_dir, }, { "lxc.cgroup.dir", true, set_config_cgroup_dir, get_config_cgroup_dir, clr_config_cgroup_dir, }, { "lxc.cgroup.relative", true, set_config_cgroup_relative, get_config_cgroup_relative, clr_config_cgroup_relative, }, - { 
"lxc.cgroup", false, set_config_cgroup_controller, get_config_cgroup_controller, clr_config_cgroup_controller, }, { "lxc.console.buffer.size", true, set_config_console_buffer_size, get_config_console_buffer_size, clr_config_console_buffer_size, }, { "lxc.console.logfile", true, set_config_console_logfile, get_config_console_logfile, clr_config_console_logfile, }, { "lxc.console.path", true, set_config_console_path, get_config_console_path, clr_config_console_path, }, @@ -1934,9 +1932,6 @@ static int __set_config_cgroup_controller(const char *key, const char *value, if (version == CGROUP2_SUPER_MAGIC) { token = "lxc.cgroup2."; token_len = 12; - } else if (version == CGROUP_SUPER_MAGIC) { - token = "lxc.cgroup."; - token_len = 11; } else { return ret_errno(EINVAL); } @@ -1962,22 +1957,12 @@ static int __set_config_cgroup_controller(const char *key, const char *value, new_cgroup->version = version; - if (version == CGROUP2_SUPER_MAGIC) - list_add_tail(&new_cgroup->head, &lxc_conf->cgroup2); - else - list_add_tail(&new_cgroup->head, &lxc_conf->cgroup); + list_add_tail(&new_cgroup->head, &lxc_conf->cgroup2); move_ptr(new_cgroup); return 0; } -static int set_config_cgroup_controller(const char *key, const char *value, - struct lxc_conf *lxc_conf, void *data) -{ - return __set_config_cgroup_controller(key, value, lxc_conf, - CGROUP_SUPER_MAGIC); -} - static int set_config_cgroup2_controller(const char *key, const char *value, struct lxc_conf *lxc_conf, void *data) { @@ -3903,11 +3888,6 @@ static int __get_config_cgroup_controller(const char *key, char *retv, namespaced_token = "lxc.cgroup2."; namespaced_token_len = STRLITERALLEN("lxc.cgroup2."); list = &c->cgroup2; - } else if (version == CGROUP_SUPER_MAGIC) { - global_token = "lxc.cgroup"; - namespaced_token = "lxc.cgroup."; - namespaced_token_len = STRLITERALLEN("lxc.cgroup."); - list = &c->cgroup; } else { return ret_errno(EINVAL); } @@ -3934,13 +3914,6 @@ static int __get_config_cgroup_controller(const char *key, char 
*retv, return fulllen; } -static int get_config_cgroup_controller(const char *key, char *retv, int inlen, - struct lxc_conf *c, void *data) -{ - return __get_config_cgroup_controller(key, retv, inlen, c, - CGROUP_SUPER_MAGIC); -} - static int get_config_cgroup2_controller(const char *key, char *retv, int inlen, struct lxc_conf *c, void *data) { @@ -4931,12 +4904,6 @@ static inline int clr_config_keyring_session(const char *key, return 0; } -static inline int clr_config_cgroup_controller(const char *key, - struct lxc_conf *c, void *data) -{ - return lxc_clear_cgroups(c, key, CGROUP_SUPER_MAGIC); -} - static inline int clr_config_cgroup2_controller(const char *key, struct lxc_conf *c, void *data) { From 2d51b77b1db6e23c43cedb958650a3b9907f6b93 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 17:23:42 +0200 Subject: [PATCH 06/29] tests: use lxc.cgroup2 instead of lxc.cgroup Signed-off-by: Alexander Mikhalitsyn --- src/tests/get_item.c | 16 ++++++++-------- src/tests/lxc-test-checkpoint-restore | 2 +- src/tests/parse_config_file.c | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/tests/get_item.c b/src/tests/get_item.c index 40cc564adf..52559c1b2b 100644 --- a/src/tests/get_item.c +++ b/src/tests/get_item.c @@ -622,22 +622,22 @@ int main(int argc, char *argv[]) goto out; } - ret = c->get_config_item(c, "lxc.cgroup", v3, 2047); + ret = c->get_config_item(c, "lxc.cgroup2", v3, 2047); if (ret < 0) { - fprintf(stderr, "%d: get_config_item(cgroup.devices) returned %d\n", __LINE__, ret); + fprintf(stderr, "%d: get_config_item(cgroup2.devices) returned %d\n", __LINE__, ret); goto out; } - printf("%d: get_config_item (cgroup.devices) returned %d %s\n", __LINE__, ret, v3); + printf("%d: get_config_item (cgroup2.devices) returned %d %s\n", __LINE__, ret, v3); - ret = c->get_config_item(c, "lxc.cgroup.devices.allow", v3, 2047); + ret = c->get_config_item(c, "lxc.cgroup2.devices.allow", v3, 2047); if (ret < 0) { - 
fprintf(stderr, "%d: get_config_item(cgroup.devices.devices.allow) returned %d\n", __LINE__, ret); + fprintf(stderr, "%d: get_config_item(cgroup2.devices.devices.allow) returned %d\n", __LINE__, ret); goto out; } - printf("%d: get_config_item (cgroup.devices.devices.allow) returned %d %s\n", __LINE__, ret, v3); + printf("%d: get_config_item (cgroup2.devices.devices.allow) returned %d %s\n", __LINE__, ret, v3); - if (!c->clear_config_item(c, "lxc.cgroup")) { - fprintf(stderr, "%d: failed clearing lxc.cgroup\n", __LINE__); + if (!c->clear_config_item(c, "lxc.cgroup2")) { + fprintf(stderr, "%d: failed clearing lxc.cgroup2\n", __LINE__); goto out; } diff --git a/src/tests/lxc-test-checkpoint-restore b/src/tests/lxc-test-checkpoint-restore index 21f9b7c652..3a2853d644 100755 --- a/src/tests/lxc-test-checkpoint-restore +++ b/src/tests/lxc-test-checkpoint-restore @@ -42,7 +42,7 @@ cat >> "$(lxc-config lxc.lxcpath)/$name/config" <set_config_item(c, "lxc.cgroup.cpuset.cpus", "1-100")) { - lxc_error("%s\n", "failed to set config item \"lxc.cgroup.cpuset.cpus\" to \"1-100\""); + if (!c->set_config_item(c, "lxc.cgroup2.cpuset.cpus", "1-100")) { + lxc_error("%s\n", "failed to set config item \"lxc.cgroup2.cpuset.cpus\" to \"1-100\""); return -1; } - if (!c->set_config_item(c, "lxc.cgroup.memory.limit_in_bytes", "123456789")) { - lxc_error("%s\n", "failed to set config item \"lxc.cgroup.memory.limit_in_bytes\" to \"123456789\""); + if (!c->set_config_item(c, "lxc.cgroup2.memory.max", "123456789")) { + lxc_error("%s\n", "failed to set config item \"lxc.cgroup2.memory.max\" to \"123456789\""); return -1; } From 5ead0bb5d9b67bcc7cc4235474837f4e0dcd28e8 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 17:37:17 +0200 Subject: [PATCH 07/29] config/templates: don't use cgroup1 settings Signed-off-by: Alexander Mikhalitsyn --- config/templates/common.conf.in | 29 ----------------------------- config/templates/userns.conf.in | 5 ----- 2 files changed, 34 
deletions(-) diff --git a/config/templates/common.conf.in b/config/templates/common.conf.in index 7fb109e049..311fbd44e8 100644 --- a/config/templates/common.conf.in +++ b/config/templates/common.conf.in @@ -15,35 +15,6 @@ lxc.cap.drop = mac_admin mac_override sys_time sys_module sys_rawio # Ensure hostname is changed on clone lxc.hook.clone = @LXCHOOKDIR@/clonehostname -# Default legacy cgroup configuration -# -# CGroup allowlist -lxc.cgroup.devices.deny = a -## Allow any mknod (but not reading/writing the node) -lxc.cgroup.devices.allow = c *:* m -lxc.cgroup.devices.allow = b *:* m -## Allow specific devices -### /dev/null -lxc.cgroup.devices.allow = c 1:3 rwm -### /dev/zero -lxc.cgroup.devices.allow = c 1:5 rwm -### /dev/full -lxc.cgroup.devices.allow = c 1:7 rwm -### /dev/tty -lxc.cgroup.devices.allow = c 5:0 rwm -### /dev/console -lxc.cgroup.devices.allow = c 5:1 rwm -### /dev/ptmx -lxc.cgroup.devices.allow = c 5:2 rwm -### /dev/random -lxc.cgroup.devices.allow = c 1:8 rwm -### /dev/urandom -lxc.cgroup.devices.allow = c 1:9 rwm -### /dev/pts/* -lxc.cgroup.devices.allow = c 136:* rwm -### fuse -lxc.cgroup.devices.allow = c 10:229 rwm - # Default unified cgroup configuration # # CGroup allowlist diff --git a/config/templates/userns.conf.in b/config/templates/userns.conf.in index 255dd01a35..b45f601fbf 100644 --- a/config/templates/userns.conf.in +++ b/config/templates/userns.conf.in @@ -1,10 +1,5 @@ # CAP_SYS_ADMIN in init-user-ns is required for cgroup.devices # -# Default legacy cgroup configuration -# -lxc.cgroup.devices.deny = -lxc.cgroup.devices.allow = - # Default unified cgroup configuration # lxc.cgroup2.devices.deny = From 45c1dea3bba6411abdafc26b7bfa7f8d10815aa0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 17:39:10 +0200 Subject: [PATCH 08/29] lxc/cgroups: warn if non-unified cgroup layout detected Signed-off-by: Alexander Mikhalitsyn --- src/lxc/cgroups/cgroup.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 
deletions(-) diff --git a/src/lxc/cgroups/cgroup.c b/src/lxc/cgroups/cgroup.c index 5e2a7d0993..b8029dade5 100644 --- a/src/lxc/cgroups/cgroup.c +++ b/src/lxc/cgroups/cgroup.c @@ -40,14 +40,11 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf) TRACE("Initialized cgroup driver %s", cgroup_ops->driver); - if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) - TRACE("Legacy cgroup layout"); - else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_HYBRID) - TRACE("Hybrid cgroup layout"); - else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) + if (pure_unified_layout(cgroup_ops)) { TRACE("Unified cgroup layout"); - else - WARN("Unsupported cgroup layout"); + } else { + WARN("Unsupported cgroup layout (%s)", cgroup_layout_name(cgroup_ops->cgroup_layout)); + } return cgroup_ops; } From 89b4c188ad32409e4de06e85375e2d392fcc9334 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 21 Apr 2026 17:43:31 +0200 Subject: [PATCH 09/29] doc: mention that legacy/hybrid hierarchy support is dropped Signed-off-by: Alexander Mikhalitsyn --- doc/lxc.container.conf.sgml.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in index 39efffbe56..ae58e5e3ba 100644 --- a/doc/lxc.container.conf.sgml.in +++ b/doc/lxc.container.conf.sgml.in @@ -1558,7 +1558,8 @@ ignore settings on systems that only use the unified hierarchy. Conversely, it will ignore options on systems that only use legacy - hierarchies. + hierarchies. (legacy and hybrid hierarchy) + support is dropped. 
From b0b65bac0a043c2c86589ad4b0b67b141ff61e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 22 Apr 2026 14:05:20 -0400 Subject: [PATCH 10/29] Merge pull request #4671 from mihalicyn/remove_cgroup1 remove cgroup1 support From 555c80b49ca2c2f2987f2c9d13a3564d963805e6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 22 Apr 2026 09:09:39 +0200 Subject: [PATCH 11/29] lxc/start: assume CLONE_PIDFD and clone3 are supported We agreed to set 6.12 as the Linux kernel requirement for the LXC 7.x line; it was released in Nov 2024 [1]. Let's drop the fallback code for cases when CLONE_PIDFD or clone3 are not supported. CLONE_PIDFD was added in 5.2 clone3 was added in 5.3 I decided to keep the fallback logic for unsupported CLONE_INTO_CGROUP for now, even though it was added in 5.7. Link: https://github.com/torvalds/linux/commit/adc218676eef25575469234709c2d87185ca223a [1] Signed-off-by: Alexander Mikhalitsyn --- src/lxc/start.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/src/lxc/start.c b/src/lxc/start.c index 7f269805e0..ddec8f7624 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1774,8 +1774,17 @@ static inline int do_share_ns(void *arg) flags |= CLONE_PARENT; handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags, &handler->pidfd); - if (handler->pid < 0) + if (handler->pid < 0) { + ERROR("Failed to clone process"); return -1; + } + + if (handler->pidfd < 0) { + kill(handler->pid, SIGKILL); + handler->pid = -1; + ERROR("CLONE_PIDFD isn't supported"); + return -1; + } return 0; } @@ -1920,7 +1929,7 @@ static int lxc_spawn(struct lxc_handler *handler) /* Try to spawn directly into target cgroup.
*/ handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2); if (handler->pid < 0) { - SYSTRACE("Failed to spawn container directly into target cgroup"); + SYSWARN("Failed to spawn container directly into target cgroup"); /* Kernel might simply be too old for CLONE_INTO_CGROUP. */ resolve_cgroup_clone_flags(handler); @@ -1931,31 +1940,6 @@ static int lxc_spawn(struct lxc_handler *handler) TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd); } - /* Kernel might be too old for clone3(). */ - if (handler->pid < 0) { - SYSTRACE("Failed to spawn container via clone3()"); - - /* - * In contrast to all other architectures arm64 verifies that - * the argument we use to retrieve the pidfd with is - * initialized to 0. But we need to be able to initialize it to - * a negative value such as our customary -EBADF so we can - * detect whether this kernel supports pidfds. If the syscall - * returns and the pidfd variable is set to something >= 0 then - * we know this is a kernel supporting pidfds. But if we can't - * set it to -EBADF then this won't work since 0 is a valid - * file descriptor too. And since legacy clone silently ignores - * unknown flags we are left without any way to detect support - * for pidfds. So let's special-case arm64 to not fail starting - * containers. - */ - #if defined(__aarch64__) - handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL); - #else - handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd); - #endif - } - if (handler->pid < 0) { SYSERROR(LXC_CLONE_ERROR); goto out_delete_net; @@ -1975,8 +1959,10 @@ static int lxc_spawn(struct lxc_handler *handler) goto out_delete_net; /* Verify that we can actually make use of pidfds. 
*/ - if (!lxc_can_use_pidfd(handler->pidfd)) + if (!lxc_can_use_pidfd(handler->pidfd)) { close_prot_errno_disarm(handler->pidfd); + goto out_delete_net; + } ret = strnprintf(pidstr, 20, "%d", handler->pid); if (ret < 0) From d8e9d4da4aabfacc56598db7f97ae301d4603071 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 22 Apr 2026 09:33:18 +0200 Subject: [PATCH 12/29] lxc: assume fsopen/open_tree/mount_setattr syscalls are supported fsopen and open_tree were added in 5.2 mount_setattr in 5.12 Signed-off-by: Alexander Mikhalitsyn --- src/lxc/cgroups/cgfsng.c | 52 +++----- src/lxc/conf.c | 277 +++++++++++++-------------------------- 2 files changed, 104 insertions(+), 225 deletions(-) diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 6e37a6a068..17f499b98f 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -1844,44 +1844,24 @@ static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, if (!is_unified_hierarchy(h)) return ret_errno(EOPNOTSUPP); - if (can_use_mount_api()) { - fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); - if (fd_fs < 0) - return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype); - - if (!is_unified_hierarchy(h)) { - for (const char **it = (const char **)h->controllers; it && *it; it++) { - if (strnequal(*it, "name=", STRLITERALLEN("name="))) - ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name=")); - else - ret = fs_set_property(fd_fs, *it, ""); - if (ret < 0) - return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs); - } - } - - ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, - flags); - } else { - __do_free char *controllers = NULL, *target = NULL; - unsigned int old_flags = 0; - const char *rootfs_mnt; - - if (!is_unified_hierarchy(h)) { - controllers = lxc_string_join(",", (const char **)h->controllers, false); - 
if (!controllers) - return ret_errno(ENOMEM); + fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); + if (fd_fs < 0) + return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype); + + if (!is_unified_hierarchy(h)) { + for (const char **it = (const char **)h->controllers; it && *it; it++) { + if (strnequal(*it, "name=", STRLITERALLEN("name="))) + ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name=")); + else + ret = fs_set_property(fd_fs, *it, ""); + if (ret < 0) + return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs); } - - rootfs_mnt = get_rootfs_mnt(rootfs); - ret = mnt_attributes_old(flags, &old_flags); - if (ret) - return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified"); - - target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL); - ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt); } + + ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt, + PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, + flags); if (ret < 0) return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)", fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); diff --git a/src/lxc/conf.c b/src/lxc/conf.c index 947cbad66f..4d366d04b5 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -808,19 +808,16 @@ static int lxc_setup_ttys(struct lxc_conf *conf) "Failed to unlink %d(%s)", rootfs->dfd_dev, tty_name); - if (can_use_mount_api()) - ret = fd_bind_mount(tty->pty, "", - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH_XDEV, - fd_to, "", - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH_XDEV, - 0, - 0, - 0, - false); - else - ret = mount_fd(tty->pty, fd_to, "none", MS_BIND, 0); + ret = fd_bind_mount(tty->pty, "", + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH_XDEV, + fd_to, "", + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH_XDEV, + 0, + 0, + 0, + false); if (ret < 0) return log_error_errno(-errno, errno, 
"Failed to bind mount \"%s\" onto \"%s\"", @@ -845,19 +842,16 @@ static int lxc_setup_ttys(struct lxc_conf *conf) "Failed to create tty mount target %d(%s)", rootfs->dfd_dev, rootfs->buf); - if (can_use_mount_api()) - ret = fd_bind_mount(tty->pty, "", - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH_XDEV, - fd_to, "", - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH, - 0, - 0, - 0, - false); - else - ret = mount_fd(tty->pty, fd_to, "none", MS_BIND, 0); + ret = fd_bind_mount(tty->pty, "", + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH_XDEV, + fd_to, "", + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH, + 0, + 0, + 0, + false); if (ret < 0) return log_error_errno(-errno, errno, "Failed to bind mount \"%s\" onto \"%s\"", @@ -1017,37 +1011,23 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, goto reset_umask; } - if (can_use_mount_api()) { - fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); - if (fd_fs < 0) - return log_error_errno(-errno, errno, "Failed to prepare filesystem context for tmpfs"); + fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); + if (fd_fs < 0) + return log_error_errno(-errno, errno, "Failed to prepare filesystem context for tmpfs"); - sprintf(mount_options, "%zu", tmpfs_size); + sprintf(mount_options, "%zu", tmpfs_size); - ret = fs_set_property(fd_fs, "mode", "0755"); - if (ret < 0) - return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - - ret = fs_set_property(fd_fs, "size", mount_options); - if (ret < 0) - return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - - ret = fs_attach(fd_fs, rootfs->dfd_mnt, "dev", - PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_BENEATH_XDEV, 0); - } else { - __do_free char *fallback_path = NULL; + ret = fs_set_property(fd_fs, "mode", "0755"); + if (ret < 0) + return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - sprintf(mount_options, "size=%zu,mode=755", tmpfs_size); - DEBUG("Using mount options: %s", 
mount_options); + ret = fs_set_property(fd_fs, "size", mount_options); + if (ret < 0) + return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - if (path) { - fallback_path = must_make_path(path, "/dev", NULL); - ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path); - } else { - ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL); - } - } + ret = fs_attach(fd_fs, rootfs->dfd_mnt, "dev", + PROTECT_OPATH_DIRECTORY, + PROTECT_LOOKUP_BENEATH_XDEV, 0); if (ret < 0) { SYSERROR("Failed to mount tmpfs on \"%s\"", path); goto reset_umask; @@ -1160,35 +1140,16 @@ static int lxc_fill_autodev(struct lxc_rootfs *rootfs) if (ret < 0) return ret_errno(EIO); - if (can_use_mount_api()) { - ret = fd_bind_mount(rootfs->dfd_host, rootfs->buf, - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH_XDEV, - rootfs->dfd_dev, device->name, - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH, - 0, - 0, - 0, - false); - } else { - char path[PATH_MAX]; - - ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/dev/%s", device->name); - if (ret < 0) - return ret_errno(EIO); - - ret = strnprintf(path, sizeof(path), "%s/dev/%s", get_rootfs_mnt(rootfs), device->name); - if (ret < 0) - return log_error(-1, "Failed to create device path for %s", device->name); - - ret = safe_mount(rootfs->buf, path, 0, MS_BIND, NULL, get_rootfs_mnt(rootfs)); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" to \"%s\"", rootfs->buf, path); - - DEBUG("Bind mounted host device node \"%s\" to \"%s\"", rootfs->buf, path); - continue; - } + ret = fd_bind_mount(rootfs->dfd_host, rootfs->buf, + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH_XDEV, + rootfs->dfd_dev, device->name, + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH, + 0, + 0, + 0, + false); DEBUG("Bind mounted host device %d(%s) to %d(%s)", rootfs->dfd_host, rootfs->buf, rootfs->dfd_dev, device->name); } (void)umask(cmask); @@ -1496,104 +1457,48 @@ static int 
lxc_setup_devpts_child(struct lxc_handler *handler) if (ret < 0 && errno != EEXIST) return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory"); - if (can_use_mount_api()) { - fd_fs = fs_prepare("devpts", -EBADF, "", 0, 0); - if (fd_fs < 0) - return syserror("Failed to prepare filesystem context for devpts"); - - ret = fs_set_property(fd_fs, "source", "devpts"); - if (ret < 0) - SYSTRACE("Failed to set \"source=devpts\" on devpts filesystem context %d", fd_fs); - - ret = fs_set_property(fd_fs, "gid", "5"); - if (ret < 0) - SYSTRACE("Failed to set \"gid=5\" on devpts filesystem context %d", fd_fs); - - ret = fs_set_flag(fd_fs, "newinstance"); - if (ret < 0) - return syserror("Failed to set \"newinstance\" property on devpts filesystem context %d", fd_fs); - - ret = fs_set_property(fd_fs, "ptmxmode", "0666"); - if (ret < 0) - return syserror("Failed to set \"ptmxmode=0666\" property on devpts filesystem context %d", fd_fs); - - ret = fs_set_property(fd_fs, "mode", "0620"); - if (ret < 0) - return syserror("Failed to set \"mode=0620\" property on devpts filesystem context %d", fd_fs); + fd_fs = fs_prepare("devpts", -EBADF, "", 0, 0); + if (fd_fs < 0) + return syserror("Failed to prepare filesystem context for devpts"); - ret = fs_set_property(fd_fs, "max", fdstr(pty_max)); - if (ret < 0) - return syserror("Failed to set \"max=%zu\" property on devpts filesystem context %d", conf->pty_max, fd_fs); - - ret = fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0); - if (ret < 0) - return syserror("Failed to finalize filesystem context %d", fd_fs); - - devpts_fd = fsmount(fd_fs, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC); - if (devpts_fd < 0) - return syserror("Failed to create new mount for filesystem context %d", fd_fs); - TRACE("Created detached devpts mount %d", devpts_fd); - - ret = move_mount(devpts_fd, "", rootfs->dfd_dev, "pts", MOVE_MOUNT_F_EMPTY_PATH); - if (ret) - return syserror("Failed to attach devpts mount %d to %d/pts", 
conf->devpts_fd, rootfs->dfd_dev); - - DEBUG("Attached detached devpts mount %d to %d/pts", devpts_fd, rootfs->dfd_dev); - } else { - char **opts; - char devpts_mntopts[256]; - char *mntopt_sets[5]; - char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620"; - - /* - * Fallback codepath in case the new mount API can't be used to - * create detached mounts. - */ - - ret = strnprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu", - default_devpts_mntopts, pty_max); - if (ret < 0) - return -1; + ret = fs_set_property(fd_fs, "source", "devpts"); + if (ret < 0) + SYSTRACE("Failed to set \"source=devpts\" on devpts filesystem context %d", fd_fs); - /* Create mountpoint for devpts instance. */ - ret = mkdirat(rootfs->dfd_dev, "pts", 0755); - if (ret < 0 && errno != EEXIST) - return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory"); + ret = fs_set_property(fd_fs, "gid", "5"); + if (ret < 0) + SYSTRACE("Failed to set \"gid=5\" on devpts filesystem context %d", fd_fs); - /* gid=5 && max= */ - mntopt_sets[0] = devpts_mntopts; + ret = fs_set_flag(fd_fs, "newinstance"); + if (ret < 0) + return syserror("Failed to set \"newinstance\" property on devpts filesystem context %d", fd_fs); - /* !gid=5 && max= */ - mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1; + ret = fs_set_property(fd_fs, "ptmxmode", "0666"); + if (ret < 0) + return syserror("Failed to set \"ptmxmode=0666\" property on devpts filesystem context %d", fd_fs); - /* gid=5 && !max= */ - mntopt_sets[2] = default_devpts_mntopts; + ret = fs_set_property(fd_fs, "mode", "0620"); + if (ret < 0) + return syserror("Failed to set \"mode=0620\" property on devpts filesystem context %d", fd_fs); - /* !gid=5 && !max= */ - mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1; + ret = fs_set_property(fd_fs, "max", fdstr(pty_max)); + if (ret < 0) + return syserror("Failed to set \"max=%zu\" property on devpts filesystem context %d", conf->pty_max, fd_fs); - 
/* end */ - mntopt_sets[4] = NULL; + ret = fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0); + if (ret < 0) + return syserror("Failed to finalize filesystem context %d", fd_fs); - for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) { - /* mount new devpts instance */ - ret = mount_at(rootfs->dfd_dev, "", 0, - rootfs->dfd_dev, "pts", PROTECT_LOOKUP_BENEATH, - "devpts", MS_NOSUID | MS_NOEXEC, *opts); - if (ret == 0) - break; - } - if (ret < 0) - return log_error_errno(-1, errno, "Failed to mount new devpts instance"); + devpts_fd = fsmount(fd_fs, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC); + if (devpts_fd < 0) + return syserror("Failed to create new mount for filesystem context %d", fd_fs); + TRACE("Created detached devpts mount %d", devpts_fd); - devpts_fd = open_at(rootfs->dfd_dev, "pts", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (devpts_fd < 0) { - devpts_fd = -EBADF; - TRACE("Failed to create detached devpts mount"); - } + ret = move_mount(devpts_fd, "", rootfs->dfd_dev, "pts", MOVE_MOUNT_F_EMPTY_PATH); + if (ret) + return syserror("Failed to attach devpts mount %d to %d/pts", conf->devpts_fd, rootfs->dfd_dev); - DEBUG("Mounted new devpts instance with options \"%s\"", *opts); - } + DEBUG("Attached detached devpts mount %d to %d/pts", devpts_fd, rootfs->dfd_dev); handler->conf->devpts_fd = move_fd(devpts_fd); @@ -1739,10 +1644,7 @@ static int bind_mount_console(int fd_devpts, struct lxc_rootfs *rootfs, * Note, there are intentionally no open or lookup restrictions since * we're operating directly on the fd. 
*/ - if (can_use_mount_api()) - return fd_bind_mount(fd_pty, "", 0, 0, fd_to, "", 0, 0, 0, 0, 0, false); - - return mount_fd(fd_pty, fd_to, "none", MS_BIND, 0); + return fd_bind_mount(fd_pty, "", 0, 0, fd_to, "", 0, 0, 0, 0, 0, false); } static int lxc_setup_dev_console(int fd_devpts, struct lxc_rootfs *rootfs, @@ -1871,21 +1773,18 @@ static int lxc_setup_ttydir_console(int fd_devpts, struct lxc_rootfs *rootfs, return syserror("Failed to open \"%d/console\"", fd_ttydir); /* bind mount '/dev//console' to '/dev/console' */ - if (can_use_mount_api()) - ret = fd_bind_mount(fd_dev_console, - "", - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH_XDEV, - fd_reg_console, - "", - PROTECT_OPATH_FILE, - PROTECT_LOOKUP_BENEATH, - 0, - 0, - 0, - false); - else - ret = mount_fd(fd_dev_console, fd_reg_console, "none", MS_BIND, 0); + ret = fd_bind_mount(fd_dev_console, + "", + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH_XDEV, + fd_reg_console, + "", + PROTECT_OPATH_FILE, + PROTECT_LOOKUP_BENEATH, + 0, + 0, + 0, + false); if (ret < 0) return syserror("Failed to mount \"%d\" on \"%d\"", fd_dev_console, fd_reg_console); From 4380d21eb05833ba9878dc0d476e2ff48d1c109e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 22 Apr 2026 17:35:02 -0400 Subject: [PATCH 13/29] Merge pull request #4672 from mihalicyn/assume_new_enough_kernel assume CLONE_PIDFD, clone3, new mount api are supported From 2f60da605485d022b43b127123c08e6425a64ac6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 25 Jul 2024 16:49:44 +0200 Subject: [PATCH 14/29] apparmor: allow nosymfollow remounts We need this for new versions of systemd, because it heavily uses MS_NOSYMFOLLOW these days. 
Signed-off-by: Alexander Mikhalitsyn --- src/lxc/lsm/apparmor.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lxc/lsm/apparmor.c b/src/lxc/lsm/apparmor.c index 9f31840ff7..1a53f3b1eb 100644 --- a/src/lxc/lsm/apparmor.c +++ b/src/lxc/lsm/apparmor.c @@ -689,6 +689,7 @@ static const struct mntopt_t { { ",nodev", sizeof(",nodev")-1 }, { ",nosuid", sizeof(",nosuid")-1 }, { ",noexec", sizeof(",noexec")-1 }, + { ",nosymfollow", sizeof(",nosymfollow")-1 }, }; static void append_remount_rule(char **profile, size_t *size, const char *rule) From 0153a7855b29d592ebe0f12a4b7635390b45aa14 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 25 Jul 2024 16:57:13 +0200 Subject: [PATCH 15/29] apparmor: allow nosymfollow remounts Signed-off-by: Alexander Mikhalitsyn --- config/apparmor/abstractions/container-base.in | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/config/apparmor/abstractions/container-base.in b/config/apparmor/abstractions/container-base.in index 87982fda3d..c7fc0d4cf7 100644 --- a/config/apparmor/abstractions/container-base.in +++ b/config/apparmor/abstractions/container-base.in @@ -128,6 +128,24 @@ mount options=(ro,remount,bind,noexec,nodev), mount options=(ro,remount,bind,nodev,nosuid), mount options=(ro,remount,bind,nosuid,noexec,nodev), + mount options=(ro,remount,bind,noatime), + mount options=(ro,remount,bind,noatime,nodev), + mount options=(ro,remount,bind,noatime,noexec), + mount options=(ro,remount,bind,noatime,nosuid), + mount options=(ro,remount,bind,noatime,noexec,nodev), + mount options=(ro,remount,bind,noatime,nosuid,nodev), + mount options=(ro,remount,bind,noatime,nosuid,noexec), + mount options=(ro,remount,bind,noatime,nosuid,noexec,nodev), + mount options=(ro,remount,bind,nosuid,noexec,strictatime), + mount options=(ro,remount,nosuid,noexec,strictatime), + mount options=(ro,remount,bind,nosymfollow), + mount options=(ro,remount,bind,nosymfollow,nodev), + mount options=(ro,remount,bind,nosymfollow,noexec), + 
mount options=(ro,remount,bind,nosymfollow,nosuid), + mount options=(ro,remount,bind,nosymfollow,noexec,nodev), + mount options=(ro,remount,bind,nosymfollow,nosuid,nodev), + mount options=(ro,remount,bind,nosymfollow,nosuid,noexec), + mount options=(ro,remount,bind,nosymfollow,nosuid,noexec,nodev), # allow moving mounts except for /proc, /sys and /dev mount options=(rw,move) /[^spd]*{,/**}, From 116abab180e50d06d5b048888990cfff6449444e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Thu, 23 Apr 2026 10:03:00 -0400 Subject: [PATCH 16/29] Merge pull request #4466 from mihalicyn/apparmor_nosymfollow apparmor: allow nosymfollow remounts From f1540aaf0d290d057512369d64eeb2a9c0159445 Mon Sep 17 00:00:00 2001 From: Fernando Picazo Date: Fri, 5 Sep 2025 22:33:36 -0500 Subject: [PATCH 17/29] lsm/apparmor: allow binfmt_misc RW mounts Signed-off-by: Fernando Picazo [ alex: fully reworked to match logic in Incus ] Signed-off-by: Alexander Mikhalitsyn --- src/lxc/lsm/apparmor.c | 44 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/src/lxc/lsm/apparmor.c b/src/lxc/lsm/apparmor.c index 1a53f3b1eb..256f8c12cb 100644 --- a/src/lxc/lsm/apparmor.c +++ b/src/lxc/lsm/apparmor.c @@ -84,9 +84,23 @@ static const char AA_PROFILE_BASE[] = " # deny access under /proc/bus to avoid e.g. 
messing with pci devices directly\n" " deny @{PROC}/bus/** wklx,\n" "\n" -" # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted\n" +" # allow binfmt_misc to be mounted\n" " mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/,\n" -" deny @{PROC}/sys/fs/** wklx,\n" +"\n" +" # deny writes in /proc/sys/fs except /proc/sys/fs/binfmt_misc\n" +" deny @{PROC}/sys/fs/[^b]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/b[^i]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/bi[^n]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/bin[^f]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binf[^m]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfm[^t]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfmt[^_]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfmt_[^m]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfmt_m[^i]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfmt_mi[^s]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfmt_mis[^c]*{,/**} wklx,\n" +" deny @{PROC}/sys/fs/binfmt_misc?*{,/**} wklx,\n" +" deny @{PROC}/sys/fs?*{,/**} wklx,\n" "\n" " # allow efivars to be mounted, writing to it will be blocked though\n" " mount fstype=efivarfs -> /sys/firmware/efi/efivars/,\n" @@ -172,10 +186,28 @@ static const char AA_PROFILE_BASE[] = " mount options=(rw,move) /sys?*{,/**},\n" "\n"; +static const char AA_PROFILE_BASE_PRIVILEGED[] = +" deny /proc/sys/fs/binfmt_misc/{,**} wklx,\n" +"\n"; + static const char AA_PROFILE_BASE_NO_NESTING[] = "\n" " # generated by: lxc-generate-aa-rules.py container-rules.base\n" -" deny /proc/sys/[^kn]*{,/**} wklx,\n" +" deny /proc/sys/[^fkn]*{,/**} wklx,\n" +" deny /proc/sys/f[^s]*{,/**} wklx,\n" +" deny /proc/sys/fs/[^b]*{,/**} wklx,\n" +" deny /proc/sys/fs/b[^i]*{,/**} wklx,\n" +" deny /proc/sys/fs/bi[^n]*{,/**} wklx,\n" +" deny /proc/sys/fs/bin[^f]*{,/**} wklx,\n" +" deny /proc/sys/fs/binf[^m]*{,/**} wklx,\n" +" deny /proc/sys/fs/binfm[^t]*{,/**} wklx,\n" +" deny /proc/sys/fs/binfmt[^_]*{,/**} wklx,\n" +" deny /proc/sys/fs/binfmt_[^m]*{,/**} wklx,\n" +" deny /proc/sys/fs/binfmt_m[^i]*{,/**} wklx,\n" +" 
deny /proc/sys/fs/binfmt_mi[^s]*{,/**} wklx,\n" +" deny /proc/sys/fs/binfmt_mis[^c]*{,/**} wklx,\n" +" deny /proc/sys/fs/binfmt_misc?*{,/**} wklx,\n" +" deny /proc/sys/fs?*{,/**} wklx,\n" " deny /proc/sys/k[^e]*{,/**} wklx,\n" " deny /proc/sys/ke[^r]*{,/**} wklx,\n" " deny /proc/sys/ker[^n]*{,/**} wklx,\n" @@ -338,6 +370,8 @@ static const char AA_PROFILE_UNPRIVILEGED[] = " ### Configuration: unprivileged container\n" " pivot_root,\n" "\n" +" mount fstype=binfmt_misc,\n" +"\n" " # Allow modifying mount propagation\n" " mount options=(rw,make-slave) -> /{,**},\n" " mount options=(rw,make-rslave) -> /{,**},\n" @@ -759,6 +793,10 @@ static char *get_apparmor_profile_content(struct lsm_ops *ops, struct lxc_conf * must_append_sized(&profile, &size, AA_PROFILE_BASE, STRARRAYLEN(AA_PROFILE_BASE)); + if (is_privileged(conf)) + must_append_sized(&profile, &size, AA_PROFILE_BASE_PRIVILEGED, + STRARRAYLEN(AA_PROFILE_BASE_PRIVILEGED)); + if (!conf->lsm_aa_allow_nesting) must_append_sized(&profile, &size, AA_PROFILE_BASE_NO_NESTING, STRARRAYLEN(AA_PROFILE_BASE_NO_NESTING)); From 282b60e60fccfd91c5e60325164ad2d0dfd6f352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Thu, 23 Apr 2026 11:34:19 -0400 Subject: [PATCH 18/29] Merge pull request #4673 from mihalicyn/binfmt_rw_mount lsm/apparmor: allow binfmt_misc RW mounts From 8c92f4347114674c0889005ff2240503e9d35b1e Mon Sep 17 00:00:00 2001 From: Mathias Gibbens Date: Sat, 25 Apr 2026 20:11:53 +0000 Subject: [PATCH 19/29] tests/lxc-test-lxc-attach: Increase sleep time On riscv64 architectures, a single second sleep doesn't appear to be sufficient to work around the busybox pipe closure bug, and the test hangs forever. Increase to three seconds. 
Signed-off-by: Mathias Gibbens --- src/tests/lxc-test-lxc-attach | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/lxc-test-lxc-attach b/src/tests/lxc-test-lxc-attach index 720545f994..75eaa8a92d 100755 --- a/src/tests/lxc-test-lxc-attach +++ b/src/tests/lxc-test-lxc-attach @@ -221,7 +221,7 @@ busybox tee --help >/dev/null 2>&1 || FAIL "missing busybox's tee applet" out=$(mktemp /tmp/out_XXXX) BS=1000000 -( sleep 3; echo "echo DATASTART ; dd if=/dev/urandom bs=$BS count=1 status=none | hexdump | tee /root/large-data.txt ; echo DATAEND" ; sleep 1 ) | \ +( sleep 3; echo "echo DATASTART ; dd if=/dev/urandom bs=$BS count=1 status=none | hexdump | tee /root/large-data.txt ; echo DATAEND" ; sleep 3 ) | \ script -q -e -c "lxc-attach -n busy -l trace -o \"${ATTACH_LOG}\"" | \ sed -n '/DATASTART/,/DATAEND/{/DATASTART/d;/DATAEND/d;s/[\r\n]*$//;p}' > $out From 16f6c890332be3fe97c51c5c91a07634c79ae283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Sat, 25 Apr 2026 16:28:41 -0400 Subject: [PATCH 20/29] Merge pull request #4674 from gibmat/extend-test-sleep-riscv64 tests/lxc-test-lxc-attach: Increase sleep time From 071cd4c60588d5a36e9dac4b820d3e858a3a7512 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Sun, 26 Apr 2026 22:15:41 +0200 Subject: [PATCH 21/29] lvm.c: make sure tp gets freed tp is __do_free. However, when we detect that it is not a thinpool, we set it to NULL, so that it can't get freed on exit. 
coverity id 1461741 Signed-off-by: Serge Hallyn --- src/lxc/storage/lvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lxc/storage/lvm.c b/src/lxc/storage/lvm.c index d563144518..dbc1ccf9fa 100644 --- a/src/lxc/storage/lvm.c +++ b/src/lxc/storage/lvm.c @@ -137,6 +137,7 @@ static int do_lvm_create(const char *path, uint64_t size, const char *thinpool) return log_error(-EINVAL, "Failed to detect whether \"%s\" is a thinpool", tp); } else if (!ret) { TRACE("Detected that \"%s\" is not a thinpool", tp); + free(tp); tp = NULL; } else { TRACE("Detected \"%s\" is a thinpool", tp); From 7b1a5eab2da1dc5cfc80e7c11c876155d15a4e60 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Sun, 26 Apr 2026 22:27:59 +0200 Subject: [PATCH 22/29] Don't leak an open fd The dfd_idmapped was being dup'd, but not freed. If we ever change it so that storage_put closes the dfd_idmapped fd, then we'll want to un-do this. For now, this is a kludgy way to avoid leaking the open fd, but should work. The new_rootfs->dfd_idmapped gets dup'd from c->lxc_conf->rootfs.dfd_idmapped. new_rootfs eventually gets assigned to new->rootfs (where new is a struct storage, usually called 'bdev'). From here there are error paths which free the bdev and return NULL, and a success path that returns bdev. But neither the error path nor the caller do anything really with the bdev, and storage_put() doesn't close that fd. So close the dfd_idmapped in both paths. Coverity id: 1641426 Signed-off-by: Serge Hallyn --- src/lxc/storage/storage.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lxc/storage/storage.c b/src/lxc/storage/storage.c index 497cad7882..2490d3c333 100644 --- a/src/lxc/storage/storage.c +++ b/src/lxc/storage/storage.c @@ -516,6 +516,8 @@ struct lxc_storage *storage_copy(struct lxc_container *c, const char *cname, } on_success: + /* The only caller, copy_storage, doesn't ever close this. 
*/ + close_prot_errno_disarm(new_rootfs.dfd_idmapped); lxc_storage_put(c->lxc_conf); return new; @@ -524,6 +526,7 @@ struct lxc_storage *storage_copy(struct lxc_container *c, const char *cname, storage_put(new); on_error_put_orig: + close_prot_errno_disarm(new_rootfs.dfd_idmapped); lxc_storage_put(c->lxc_conf); return NULL; From da651f1d3c8ec8c6e73fd8ca1dccf4e8fcb5d20f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Tue, 28 Apr 2026 10:11:56 -0400 Subject: [PATCH 23/29] Merge pull request #4677 from hallyn/2026-04-28/leakfd Don't leak an open fd From 35faadf2c82cb7b445d4e7087b45accd74b5b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Tue, 28 Apr 2026 10:12:26 -0400 Subject: [PATCH 24/29] Merge pull request #4676 from hallyn/2026-04-28/tp lvm.c: make sure tp gets freed From 1974ca441b69cf4499c20575a61ec80813c2b0e9 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 20 Apr 2026 23:07:47 -0500 Subject: [PATCH 25/29] lxc-user-nic: clarify and fix Some variable names were a bit confusing in find_line and cull_entries. Rename and document, and fix the flows using these. It's possible that a more maintainable approach, long term, would be to break these up differently: have one function create a neat in-memory data structure representing the files, and have the paths currently using find_line and cull_entries peek into the data structures. But I think this is pretty clear. This fixes CVE-2026-39402 Signed-off-by: Serge E.
Hallyn Reviewed-by: Alexander Mikhalitsyn --- src/lxc/cmd/lxc_user_nic.c | 75 +++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/src/lxc/cmd/lxc_user_nic.c b/src/lxc/cmd/lxc_user_nic.c index 98aedf8216..83fd84a185 100644 --- a/src/lxc/cmd/lxc_user_nic.c +++ b/src/lxc/cmd/lxc_user_nic.c @@ -374,19 +374,58 @@ static char *get_eow(char *s, char *e) return s; } +static bool same_word(const char *start, const char *end, const char *word) +{ + size_t wordlen = strlen(word); + size_t buflen = end - start; + + if (wordlen != buflen) + return false; + if (strncmp(start, word, wordlen) == 0) + return true; + return false; +} + +/* + * in: + * @buf_start and @buf_end point to the buffer to be read. + * + * @name is the name of the user who should own the link. + * + * @net_type is the type of connection, e.g. veth + * + * @net_link is the name of the bridge, e.g. lxcbr0, on which the + * device should live. + * + * @net_dev is the name of the device itself in the host netns. + * + * out: + * @is_owner is set to true if the current line is owned by @name. + * + * @nic_found is set to true if the line is specifically for the passed-in + * @net_dev, and it is on the right @net_link and of the right @net_type. + * + * @exists is set to false if the nic in this line no longer exists. This is + * used by cull_entries(): if we set it to false, then this line will be + * removed from the LXC_USERNIC_DB (e.g. /var/run/lxc/nics).
+ */ static char *find_line(char *buf_start, char *buf_end, char *name, char *net_type, char *net_link, char *net_dev, - bool *owner, bool *found, bool *keep) + bool *is_owner, bool *nic_found, bool *exists) { char *end_of_line, *end_of_word, *line; + bool right_net_type, right_bridge, right_link_name;; while (buf_start < buf_end) { size_t len; char netdev_name[IFNAMSIZ]; - *found = false; - *keep = true; - *owner = false; + *nic_found = false; + *exists = true; + *is_owner = false; + right_net_type = false; + right_bridge = false; + right_link_name = false; end_of_line = get_eol(buf_start, buf_end); if (end_of_line >= buf_end) @@ -405,11 +444,8 @@ static char *find_line(char *buf_start, char *buf_end, char *name, if (!end_of_word) return NULL; - if (strncmp(buf_start, name, strlen(name))) - *found = false; - else - if (strlen(name) == (size_t)(end_of_word - buf_start)) - *owner = true; + if (same_word(buf_start, end_of_word, name)) + *is_owner = true; buf_start = end_of_word + 1; while ((buf_start < buf_end) && isblank(*buf_start)) @@ -421,8 +457,8 @@ static char *find_line(char *buf_start, char *buf_end, char *name, if (!end_of_word) return NULL; - if (strncmp(buf_start, net_type, strlen(net_type))) - *found = false; + if (same_word(buf_start, end_of_word, net_type)) + right_net_type = true; buf_start = end_of_word + 1; while ((buf_start < buf_end) && isblank(*buf_start)) @@ -434,8 +470,8 @@ static char *find_line(char *buf_start, char *buf_end, char *name, if (!end_of_word) return NULL; - if (strncmp(buf_start, net_link, strlen(net_link))) - *found = false; + if (same_word(buf_start, end_of_word, net_link)) + right_bridge = true; buf_start = end_of_word + 1; while ((buf_start < buf_end) && isblank(*buf_start)) @@ -454,10 +490,13 @@ static char *find_line(char *buf_start, char *buf_end, char *name, memcpy(netdev_name, buf_start, len); netdev_name[len] = '\0'; - *keep = lxc_nic_exists(netdev_name); + *exists = lxc_nic_exists(netdev_name); if (net_dev && 
!strcmp(netdev_name, net_dev)) - *found = true; + right_link_name = true; + + if (right_net_type && right_bridge && right_link_name) + *nic_found = true; return line; @@ -587,7 +626,7 @@ static bool cull_entries(int fd, char *name, char *net_type, char *net_link, size_t length = 0; int ret; char *buf_end, *buf_start; - bool found, keep; + bool nic_found, is_owner, keep; ret = fd_to_buf(fd, &buf, &length); if (ret < 0) { @@ -603,7 +642,7 @@ static bool cull_entries(int fd, char *name, char *net_type, char *net_link, buf_start = buf; buf_end = buf + length; while ((buf_start = find_line(buf_start, buf_end, name, net_type, - net_link, net_dev, &(bool){true}, &found, + net_link, net_dev, &is_owner, &nic_found, &keep))) { struct entry_line *newe; @@ -611,7 +650,7 @@ static bool cull_entries(int fd, char *name, char *net_type, char *net_link, if (!newe) return false; - if (found) + if (nic_found && is_owner) *found_nicname = true; entry_lines = newe; From 20acae8e8fc8792f314fd276ebab29eba1206ee0 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 20 Apr 2026 23:08:17 -0500 Subject: [PATCH 26/29] usernic: add a test for ovs port deletion permission Signed-off-by: Serge E. Hallyn Reviewed-by: Alexander Mikhalitsyn --- src/tests/lxc-test-usernic-2.in | 129 ++++++++++++++++++++++++++++++++ src/tests/meson.build | 7 ++ 2 files changed, 136 insertions(+) create mode 100755 src/tests/lxc-test-usernic-2.in diff --git a/src/tests/lxc-test-usernic-2.in b/src/tests/lxc-test-usernic-2.in new file mode 100755 index 0000000000..567ac6e5a2 --- /dev/null +++ b/src/tests/lxc-test-usernic-2.in @@ -0,0 +1,129 @@ +#!/bin/bash + +# SPDX-License-Identifier: LGPL-2.1+ + +# lxc: linux Container library +# +# This test verifies that lxc-user-nic can't be used by an +# unprivileged user to delete another user's ovs nics.
+# +# This test assumes an Ubuntu host + +DONE=0 +LXC_USER_NIC="@LIBEXECDIR@/lxc/lxc-user-nic" + +apt-get -y install openvswitch-switch + +run_cmd() { + u=$1 + shift + sudo -i -u $u \ + env http_proxy=${http_proxy:-} https_proxy=${https_proxy:-} \ + XDG_RUNTIME_DIR=/run/user/$(id -u $u) ASAN_OPTIONS=${ASAN_OPTIONS:-} \ + UBSAN_OPTIONS=${UBSAN_OPTIONS:-} $* +} + +cleanup() { + set +e + + ( + run_cmd usernic-first "lxc-stop -n b1 -k" + run_cmd usernic-second "lxc-stop -n b1 -k" + sed -i '/usernic-first/d' /run/lxc/nics /etc/lxc/lxc-usernet + sed -i '/usernic-second/d' /run/lxc/nics /etc/lxc/lxc-usernet + ovs-vsctl del-br usernic-vs + + pkill -u $(id -u usernic-first) -9 + pkill -u $(id -u usernic-second) -9 + + rm -rf /tmp/usernic-test + rm -rf /home/usernic-first /run/user/$(id -u usernic-first) + rm -rf /home/usernic-second /run/user/$(id -u usernic-second) + + deluser usernic-first + deluser usernic-second + ) >/dev/null 2>&1 + + if [ "$DONE" = "1" ]; then + echo "PASS" + exit 0 + fi + + echo "FAIL" + exit 1 +} + +set -eux +trap cleanup EXIT SIGHUP SIGINT SIGTERM + +# create a test user +deluser usernic-first || true +useradd usernic-first +mkdir -p /home/usernic-first +chown usernic-first: /home/usernic-first +usermod -v 910000-919999 -w 910000-919999 usernic-first + +mkdir -p /home/usernic-first/.config/lxc/ +cat > /home/usernic-first/.config/lxc/default.conf << EOF +lxc.net.0.type = veth +lxc.net.0.link = usernic-vs +lxc.net.0.flags = up +lxc.idmap = u 0 910000 10000 +lxc.idmap = g 0 910000 10000 +EOF + +deluser usernic-second || true +useradd usernic-second +mkdir -p /home/usernic-second +chown usernic-second: /home/usernic-second +usermod -v 920000-929999 -w 920000-929999 usernic-second + +mkdir -p /home/usernic-second/.config/lxc/ +cat > /home/usernic-second/.config/lxc/default.conf << EOF +lxc.net.0.type = veth +lxc.net.0.link = usernic-vs +lxc.net.0.flags = up +lxc.idmap = u 0 920000 10000 +lxc.idmap = g 0 920000 10000 +lxc.apparmor.profile = 
lxc-container-default-with-nesting +EOF + +mkdir -p /run/user/$(id -u usernic-first) /run/user/$(id -u usernic-second) +chown -R usernic-first: /run/user/$(id -u usernic-first) /home/usernic-first +chown -R usernic-second: /run/user/$(id -u usernic-second) /home/usernic-second + +ovs-vsctl add-br usernic-vs + +# Give each a quota of one nic on this bridge +touch /etc/lxc/lxc-usernet +sed -i '/^usernic-first/d' /etc/lxc/lxc-usernet +sed -i '/^usernic-second/d' /etc/lxc/lxc-usernet +echo "usernic-second veth usernic-vs 1" >> /etc/lxc/lxc-usernet +echo "usernic-first veth usernic-vs 1" >> /etc/lxc/lxc-usernet + +run_cmd usernic-first "lxc-create -t busybox -n b1" +run_cmd usernic-first "lxc-start -n b1 -d" +run_cmd usernic-first "lxc-wait -n b1 -s RUNNING" +p1=$(run_cmd usernic-first "lxc-info -n b1 -p -H") + +run_cmd usernic-second "lxc-create -t busybox -n b1" +run_cmd usernic-second "lxc-start -n b1 -d" +run_cmd usernic-second "lxc-wait -n b1 -s RUNNING" +p2=$(run_cmd usernic-second "lxc-info -n b1 -p -H") + +ovs-vsctl list-ports usernic-vs +n1=$(ovs-vsctl list-ports usernic-vs | wc -l) +if [[ $n1 -ne 2 ]]; then + echo "wrong number of nics" + cleanup 1 +fi + +dev=$(grep usernic-first /run/lxc/nics | cut -f 4 -d\ ) +if run_cmd usernic-second \ + "$LXC_USER_NIC delete xx xx /proc/$p2/ns/net veth usernic-vs $dev"; then + echo "FAIL: unpriv user could unlink another user's ovs port" + cleanup 1 +fi + +echo "All tests passed" +DONE=1 diff --git a/src/tests/meson.build b/src/tests/meson.build index 2b997b73f5..5fc7147fce 100644 --- a/src/tests/meson.build +++ b/src/tests/meson.build @@ -401,6 +401,13 @@ if want_tests input: 'lxc-test-usernic.in', output: 'lxc-test-usernic') + test_programs += configure_file( + configuration: conf, + install: true, + install_dir: bindir, + input: 'lxc-test-usernic-2.in', + output: 'lxc-test-usernic-2') + test_programs += configure_file( configuration: dummy_config_data, install: true, From 5c26ff09ccc48d5f9704aa38ede38afd77c640d5 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 29 Apr 2026 18:15:10 -0400 Subject: [PATCH 27/29] Merge pull request #4678 from stgraber/security Fix security issue with lxc-user-nic and OpenVswitch networks From 623163302f427e7722e2c459b27a7b80cd092624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Thu, 30 Apr 2026 00:16:55 +0200 Subject: [PATCH 28/29] Release LXC 7.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- meson.build | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/meson.build b/meson.build index ebbd560053..47f75e1b97 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'lxc', 'c', - version: '6.0.0', + version: '7.0.0', license: 'LGPLv2+', default_options: [ 'b_lto=true', @@ -26,14 +26,14 @@ liblxc_dependencies = [] oss_fuzz_dependencies = [] # Version. -liblxc_version = '1.8.0' +liblxc_version = '1.9.0' version_data = configuration_data() -version_data.set('LXC_VERSION_MAJOR', '6') +version_data.set('LXC_VERSION_MAJOR', '7') version_data.set('LXC_VERSION_MINOR', '0') version_data.set('LXC_VERSION_MICRO', '0') version_data.set('LXC_VERSION_BETA', '') version_data.set('LXC_ABI', liblxc_version) -version_data.set('LXC_DEVEL', '1') +version_data.set('LXC_DEVEL', '0') version_data.set('LXC_VERSION', meson.project_version()) # Path handling. 
From d787c1aa0d2547e9e9691663dbaef4282c4e7217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Tue, 2 Apr 2024 23:34:20 -0400 Subject: [PATCH 29/29] meson: Set DEVEL flag post release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 47f75e1b97..2098dbfe74 100644 --- a/meson.build +++ b/meson.build @@ -33,7 +33,7 @@ version_data.set('LXC_VERSION_MINOR', '0') version_data.set('LXC_VERSION_MICRO', '0') version_data.set('LXC_VERSION_BETA', '') version_data.set('LXC_ABI', liblxc_version) -version_data.set('LXC_DEVEL', '0') +version_data.set('LXC_DEVEL', '1') version_data.set('LXC_VERSION', meson.project_version()) # Path handling.