diff --git a/config/apparmor/abstractions/container-base.in b/config/apparmor/abstractions/container-base.in
index 87982fda3d..c7fc0d4cf7 100644
--- a/config/apparmor/abstractions/container-base.in
+++ b/config/apparmor/abstractions/container-base.in
@@ -128,6 +128,24 @@
mount options=(ro,remount,bind,noexec,nodev),
mount options=(ro,remount,bind,nodev,nosuid),
mount options=(ro,remount,bind,nosuid,noexec,nodev),
+ mount options=(ro,remount,bind,noatime),
+ mount options=(ro,remount,bind,noatime,nodev),
+ mount options=(ro,remount,bind,noatime,noexec),
+ mount options=(ro,remount,bind,noatime,nosuid),
+ mount options=(ro,remount,bind,noatime,noexec,nodev),
+ mount options=(ro,remount,bind,noatime,nosuid,nodev),
+ mount options=(ro,remount,bind,noatime,nosuid,noexec),
+ mount options=(ro,remount,bind,noatime,nosuid,noexec,nodev),
+ mount options=(ro,remount,bind,nosuid,noexec,strictatime),
+ mount options=(ro,remount,nosuid,noexec,strictatime),
+ mount options=(ro,remount,bind,nosymfollow),
+ mount options=(ro,remount,bind,nosymfollow,nodev),
+ mount options=(ro,remount,bind,nosymfollow,noexec),
+ mount options=(ro,remount,bind,nosymfollow,nosuid),
+ mount options=(ro,remount,bind,nosymfollow,noexec,nodev),
+ mount options=(ro,remount,bind,nosymfollow,nosuid,nodev),
+ mount options=(ro,remount,bind,nosymfollow,nosuid,noexec),
+ mount options=(ro,remount,bind,nosymfollow,nosuid,noexec,nodev),
# allow moving mounts except for /proc, /sys and /dev
mount options=(rw,move) /[^spd]*{,/**},
diff --git a/config/templates/common.conf.in b/config/templates/common.conf.in
index 7fb109e049..311fbd44e8 100644
--- a/config/templates/common.conf.in
+++ b/config/templates/common.conf.in
@@ -15,35 +15,6 @@ lxc.cap.drop = mac_admin mac_override sys_time sys_module sys_rawio
# Ensure hostname is changed on clone
lxc.hook.clone = @LXCHOOKDIR@/clonehostname
-# Default legacy cgroup configuration
-#
-# CGroup allowlist
-lxc.cgroup.devices.deny = a
-## Allow any mknod (but not reading/writing the node)
-lxc.cgroup.devices.allow = c *:* m
-lxc.cgroup.devices.allow = b *:* m
-## Allow specific devices
-### /dev/null
-lxc.cgroup.devices.allow = c 1:3 rwm
-### /dev/zero
-lxc.cgroup.devices.allow = c 1:5 rwm
-### /dev/full
-lxc.cgroup.devices.allow = c 1:7 rwm
-### /dev/tty
-lxc.cgroup.devices.allow = c 5:0 rwm
-### /dev/console
-lxc.cgroup.devices.allow = c 5:1 rwm
-### /dev/ptmx
-lxc.cgroup.devices.allow = c 5:2 rwm
-### /dev/random
-lxc.cgroup.devices.allow = c 1:8 rwm
-### /dev/urandom
-lxc.cgroup.devices.allow = c 1:9 rwm
-### /dev/pts/*
-lxc.cgroup.devices.allow = c 136:* rwm
-### fuse
-lxc.cgroup.devices.allow = c 10:229 rwm
-
# Default unified cgroup configuration
#
# CGroup allowlist
diff --git a/config/templates/userns.conf.in b/config/templates/userns.conf.in
index 255dd01a35..b45f601fbf 100644
--- a/config/templates/userns.conf.in
+++ b/config/templates/userns.conf.in
@@ -1,10 +1,5 @@
# CAP_SYS_ADMIN in init-user-ns is required for cgroup.devices
#
-# Default legacy cgroup configuration
-#
-lxc.cgroup.devices.deny =
-lxc.cgroup.devices.allow =
-
# Default unified cgroup configuration
#
lxc.cgroup2.devices.deny =
diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in
index 39efffbe56..ae58e5e3ba 100644
--- a/doc/lxc.container.conf.sgml.in
+++ b/doc/lxc.container.conf.sgml.in
@@ -1558,7 +1558,8 @@
ignore settings on systems that only use
the unified hierarchy. Conversely, it will ignore
options on systems that only use legacy
- hierarchies.
+ hierarchies. Note that support for legacy and hybrid
+ hierarchies has been dropped.
diff --git a/meson.build b/meson.build
index ebbd560053..2098dbfe74 100644
--- a/meson.build
+++ b/meson.build
@@ -4,7 +4,7 @@
project(
'lxc',
'c',
- version: '6.0.0',
+ version: '7.0.0',
license: 'LGPLv2+',
default_options: [
'b_lto=true',
@@ -26,9 +26,9 @@ liblxc_dependencies = []
oss_fuzz_dependencies = []
# Version.
-liblxc_version = '1.8.0'
+liblxc_version = '1.9.0'
version_data = configuration_data()
-version_data.set('LXC_VERSION_MAJOR', '6')
+version_data.set('LXC_VERSION_MAJOR', '7')
version_data.set('LXC_VERSION_MINOR', '0')
version_data.set('LXC_VERSION_MICRO', '0')
version_data.set('LXC_VERSION_BETA', '')
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index fcaea291fc..17f499b98f 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -184,138 +184,6 @@ int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool l
return 0;
}
-/* Create cpumask from cpulist aka turn:
- *
- * 0,2-3
- *
- * into bit array
- *
- * 1 0 1 1
- */
-static int lxc_cpumask(char *buf, __u32 **bitarr, __u32 *last_set_bit)
-{
- __do_free __u32 *arr_u32 = NULL;
- __u32 cur_last_set_bit = 0, nbits = 256;
- __u32 nr_u32;
- char *token;
-
- nr_u32 = BITS_TO_LONGS(nbits);
- arr_u32 = zalloc(nr_u32 * sizeof(__u32));
- if (!arr_u32)
- return ret_errno(ENOMEM);
-
- lxc_iterate_parts(token, buf, ",") {
- __u32 last_bit, first_bit;
- char *range;
-
- errno = 0;
- first_bit = strtoul(token, NULL, 0);
- last_bit = first_bit;
- range = strchr(token, '-');
- if (range)
- last_bit = strtoul(range + 1, NULL, 0);
-
- if (!(first_bit <= last_bit))
- return ret_errno(EINVAL);
-
- if (last_bit >= nbits) {
- __u32 add_bits = last_bit - nbits + 32;
- __u32 new_nr_u32;
- __u32 *p;
-
- new_nr_u32 = BITS_TO_LONGS(nbits + add_bits);
- p = realloc(arr_u32, new_nr_u32 * sizeof(uint32_t));
- if (!p)
- return ret_errno(ENOMEM);
- arr_u32 = move_ptr(p);
-
- memset(arr_u32 + nr_u32, 0,
- (new_nr_u32 - nr_u32) * sizeof(uint32_t));
- nbits += add_bits;
- }
-
- while (first_bit <= last_bit)
- set_bit(first_bit++, arr_u32);
-
- if (last_bit > cur_last_set_bit)
- cur_last_set_bit = last_bit;
- }
-
- *last_set_bit = cur_last_set_bit;
- *bitarr = move_ptr(arr_u32);
- return 0;
-}
-
-static int lxc_cpumask_update(char *buf, __u32 *bitarr, __u32 last_set_bit,
- bool clear)
-{
- bool flipped = false;
- char *token;
-
- lxc_iterate_parts(token, buf, ",") {
- __u32 last_bit, first_bit;
- char *range;
-
- errno = 0;
- first_bit = strtoul(token, NULL, 0);
- last_bit = first_bit;
- range = strchr(token, '-');
- if (range)
- last_bit = strtoul(range + 1, NULL, 0);
-
- if (!(first_bit <= last_bit)) {
- WARN("The cup range seems to be inverted: %u-%u", first_bit, last_bit);
- continue;
- }
-
- if (last_bit > last_set_bit)
- continue;
-
- while (first_bit <= last_bit) {
- if (clear && is_set(first_bit, bitarr)) {
- flipped = true;
- clear_bit(first_bit, bitarr);
- } else if (!clear && !is_set(first_bit, bitarr)) {
- flipped = true;
- set_bit(first_bit, bitarr);
- }
-
- first_bit++;
- }
- }
-
- if (flipped)
- return 1;
-
- return 0;
-}
-
-/* Turn cpumask into simple, comma-separated cpulist. */
-static char *lxc_cpumask_to_cpulist(__u32 *bitarr, __u32 last_set_bit)
-{
- __do_free_string_list char **cpulist = NULL;
- char numstr[INTTYPE_TO_STRLEN(__u32)] = {0};
- int ret;
-
- for (__u32 bit = 0; bit <= last_set_bit; bit++) {
- if (!is_set(bit, bitarr))
- continue;
-
- ret = strnprintf(numstr, sizeof(numstr), "%u", bit);
- if (ret < 0)
- return NULL;
-
- ret = lxc_append_string(&cpulist, numstr);
- if (ret < 0)
- return ret_set_errno(NULL, ENOMEM);
- }
-
- if (!cpulist)
- return ret_set_errno(NULL, ENOMEM);
-
- return lxc_string_join(",", (const char **)cpulist, false);
-}
-
static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
return h->fs_type == UNIFIED_HIERARCHY;
@@ -580,131 +448,8 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
SYSWARN("Failed to destroy cgroups");
}
-#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
-#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
-static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
- bool am_initialized)
-{
- __do_free char *cpulist = NULL, *isolcpus = NULL,
- *offlinecpus = NULL, *posscpus = NULL;
- __do_free __u32 *possmask = NULL;
- int ret;
- __u32 poss_last_set_bit = 0;
-
-#if !IS_BIONIC
- posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
-#else
- posscpus = read_file_at(dfd_parent, "cpus", PROTECT_OPEN, 0);
-#endif
- if (!posscpus)
- return log_error_errno(false, errno, "Failed to read file %d/cpuset.cpus", dfd_parent);
-
- if (file_exists(__ISOL_CPUS)) {
- isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
- if (!isolcpus)
- return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
-
- if (!isdigit(isolcpus[0]))
- free_disarm(isolcpus);
- } else {
- TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
- }
-
- if (file_exists(__OFFLINE_CPUS)) {
- offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
- if (!offlinecpus)
- return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
-
- if (!isdigit(offlinecpus[0]))
- free_disarm(offlinecpus);
- } else {
- TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
- }
-
- if (!isolcpus && !offlinecpus) {
- cpulist = move_ptr(posscpus);
- goto copy_parent;
- }
-
- ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit);
- if (ret)
- return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
-
- if (isolcpus)
- ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true);
-
- if (offlinecpus)
- ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true);
-
- if (!ret) {
- cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit);
- TRACE("No isolated or offline cpus present in cpuset");
- } else {
- cpulist = move_ptr(posscpus);
- TRACE("Removed isolated or offline cpus from cpuset");
- }
- if (!cpulist)
- return log_error_errno(false, errno, "Failed to create cpu list");
-
-copy_parent:
- if (!am_initialized) {
-#if !IS_BIONIC
- ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
-#else
- ret = lxc_writeat(dfd_child, "cpus", cpulist, strlen(cpulist));
-#endif
- if (ret < 0)
- return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
-
- TRACE("Copied cpu settings of parent cgroup");
- }
-
- return true;
-}
-
-static bool cpuset1_initialize(int dfd_base, int dfd_next)
-{
- char mems[PATH_MAX];
- ssize_t bytes;
- char v;
-
- /* Determine whether the base cgroup has cpuset inheritance turned on. */
- bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
- if (bytes < 0)
- return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
-
- /* Initialize cpuset.cpus removing any isolated and offline cpus. */
- if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
- return syserror_ret(false, "Failed to initialize cpuset.cpus");
-
- /* Read cpuset.mems from parent... */
-#if !IS_BIONIC
- bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
-#else
- bytes = lxc_readat(dfd_base, "mems", mems, sizeof(mems));
-#endif
- if (bytes < 0)
- return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);
-
- /* and copy to first cgroup in the tree... */
-#if !IS_BIONIC
- bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
-#else
- bytes = lxc_writeat(dfd_next, "mems", mems, bytes);
-#endif
- if (bytes < 0)
- return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);
-
- /* and finally turn on cpuset inheritance. */
- bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
- if (bytes < 0)
- return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
-
- return log_trace(true, "Initialized cpuset in the legacy hierarchy");
-}
-
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
- bool cpuset_v1, bool eexist_ignore)
+ bool eexist_ignore)
{
__do_close int dfd_final = -EBADF;
int dfd_cur = dfd_base;
@@ -747,8 +492,7 @@ static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
!ret ? " newly created" : "", dfd_base, cur);
if (dfd_cur != dfd_base)
close(dfd_cur);
- else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
- return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
+
/*
* Leave dfd_final pointing to the last fd we opened so
* it will be automatically zapped if we return early.
@@ -771,17 +515,10 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
const char *cgroup_leaf, bool payload)
{
__do_close int fd_limit = -EBADF, fd_final = -EBADF;
- bool cpuset_v1 = false;
-
- /*
- * The legacy cpuset controller needs massaging in case inheriting
- * settings from its immediate ancestor cgroup hasn't been turned on.
- */
- cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
if (payload && cgroup_leaf) {
/* With isolation both parts need to not already exist. */
- fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
+ fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, false);
if (fd_limit < 0)
return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
@@ -791,21 +528,11 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
TRACE("Created limit cgroup %d->%d(%s)",
h->dfd_lim, h->dfd_base, cgroup_limit_dir);
- /*
- * With isolation the devices legacy cgroup needs to be
- * iinitialized early, as it typically contains an 'a' (all)
- * line, which is not possible once a subdirectory has been
- * created.
- */
- if (string_in_list(h->controllers, "devices") &&
- !ops->setup_limits_legacy(ops, conf, true))
- return log_warn(false, "Failed to setup legacy device limits");
-
/*
* If we use a separate limit cgroup, the leaf cgroup, i.e. the
* cgroup the container actually resides in, is below fd_limit.
*/
- fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
+ fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, false);
if (fd_final < 0) {
/* Ensure we don't leave any garbage behind. */
if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
@@ -818,7 +545,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);
} else {
- fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
+ fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, false);
if (fd_final < 0)
return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
@@ -905,7 +632,6 @@ __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
__do_close int fd_pivot = -EBADF;
__do_free char *pivot_path = NULL;
struct hierarchy *h = ops->hierarchies[i];
- bool cpuset_v1 = false;
int ret;
/* Monitor might have died before we entered the cgroup. */
@@ -921,9 +647,7 @@ __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
else
pivot_path = must_make_path(CGROUP_PIVOT, NULL);
- cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
-
- fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
+ fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, true);
if (fd_pivot < 0) {
SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
continue;
@@ -2089,75 +1813,6 @@ __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
}
}
-/* cgroup-full:* is done, no need to create subdirs */
-static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
-{
- switch (cgroup_automount_type) {
- case LXC_AUTO_CGROUP_RO:
- return true;
- case LXC_AUTO_CGROUP_RW:
- return true;
- case LXC_AUTO_CGROUP_MIXED:
- return true;
- }
-
- return false;
-}
-
-/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
- * remount controller ro if needed and bindmount the cgroupfs onto
- * control/the/cg/path.
- */
-static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
- char *hierarchy_mnt, char *cgpath,
- const char *container_cgroup)
-{
- __do_free char *sourcepath = NULL;
- int ret, remount_flags;
- int flags = MS_BIND;
-
- if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
- (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
- ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
- if (ret < 0)
- return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
- hierarchy_mnt, hierarchy_mnt);
-
- remount_flags = add_required_remount_flags(hierarchy_mnt,
- hierarchy_mnt,
- flags | MS_REMOUNT);
- ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
- remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
- NULL);
- if (ret < 0)
- return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);
-
- INFO("Remounted %s read-only", hierarchy_mnt);
- }
-
- sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
- if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
- flags |= MS_RDONLY;
-
- ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
- if (ret < 0)
- return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
- h->controllers[0], cgpath);
- INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
-
- if (flags & MS_RDONLY) {
- remount_flags = add_required_remount_flags(sourcepath, cgpath,
- flags | MS_REMOUNT);
- ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
- if (ret < 0)
- return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
- INFO("Remounted %s read-only", cgpath);
- }
-
- INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
- return 0;
-}
-
/* __cgroupfs_mount
*
* Mount cgroup hierarchies directly without using bind-mounts. The main
@@ -2170,7 +1825,7 @@ static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
{
__do_close int fd_fs = -EBADF;
unsigned int flags = 0;
- char *fstype;
+ char *fstype = "cgroup2";
int ret;
if (dfd_mnt_cgroupfs < 0)
@@ -2186,49 +1841,27 @@ static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
(cgroup_automount_type == LXC_AUTO_CGROUP2_RO))
flags |= MOUNT_ATTR_RDONLY;
- if (is_unified_hierarchy(h))
- fstype = "cgroup2";
- else
- fstype = "cgroup";
-
- if (can_use_mount_api()) {
- fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
- if (fd_fs < 0)
- return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
-
- if (!is_unified_hierarchy(h)) {
- for (const char **it = (const char **)h->controllers; it && *it; it++) {
- if (strnequal(*it, "name=", STRLITERALLEN("name=")))
- ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
- else
- ret = fs_set_property(fd_fs, *it, "");
- if (ret < 0)
- return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
- }
- }
+ if (!is_unified_hierarchy(h))
+ return ret_errno(EOPNOTSUPP);
- ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
- flags);
- } else {
- __do_free char *controllers = NULL, *target = NULL;
- unsigned int old_flags = 0;
- const char *rootfs_mnt;
+ fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
+ if (fd_fs < 0)
+ return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
- if (!is_unified_hierarchy(h)) {
- controllers = lxc_string_join(",", (const char **)h->controllers, false);
- if (!controllers)
- return ret_errno(ENOMEM);
+ if (!is_unified_hierarchy(h)) {
+ for (const char **it = (const char **)h->controllers; it && *it; it++) {
+ if (strnequal(*it, "name=", STRLITERALLEN("name=")))
+ ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
+ else
+ ret = fs_set_property(fd_fs, *it, "");
+ if (ret < 0)
+ return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
}
-
- rootfs_mnt = get_rootfs_mnt(rootfs);
- ret = mnt_attributes_old(flags, &old_flags);
- if (ret)
- return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");
-
- target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
- ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
}
+
+ ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
+ PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
+ flags);
if (ret < 0)
return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
@@ -2246,26 +1879,6 @@ static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
dfd_mnt_cgroupfs, hierarchy_mnt);
}
-static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
- struct lxc_rootfs *rootfs,
- int dfd_mnt_cgroupfs,
- const char *hierarchy_mnt)
-{
- switch (cgroup_automount_type) {
- case LXC_AUTO_CGROUP_FULL_RO:
- break;
- case LXC_AUTO_CGROUP_FULL_RW:
- break;
- case LXC_AUTO_CGROUP_FULL_MIXED:
- break;
- default:
- return 0;
- }
-
- return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
- dfd_mnt_cgroupfs, hierarchy_mnt);
-}
-
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
struct lxc_handler *handler, int cg_flags)
{
@@ -2275,7 +1888,6 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
bool in_cgroup_ns = false, wants_force_mount = false;
struct lxc_conf *conf = handler->conf;
struct lxc_rootfs *rootfs = &conf->rootfs;
- const char *rootfs_mnt = get_rootfs_mnt(rootfs);
int ret;
if (!ops)
@@ -2419,93 +2031,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
return syserror_ret(false, "Failed to mount cgroups");
}
- /*
- * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
- * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
- * DEFAULT_CGROUP_MOUNTPOINT define.
- */
- if (can_use_mount_api()) {
- fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
- if (fd_fs < 0)
- return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs");
-
- ret = fs_set_property(fd_fs, "mode", "0755");
- if (ret < 0)
- return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
-
- ret = fs_set_property(fd_fs, "size", "10240k");
- if (ret < 0)
- return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
-
- ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
- MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
- MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
- } else {
- cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
- ret = safe_mount(NULL, cgroup_root, "tmpfs",
- MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
- "size=10240k,mode=755", rootfs_mnt);
- }
- if (ret < 0)
- return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
- DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
-
- dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
- if (dfd_mnt_tmpfs < 0)
- return syserror_ret(false, "Failed to open %d(%s)",
- rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
-
- for (int i = 0; ops->hierarchies[i]; i++) {
- __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
- struct hierarchy *h = ops->hierarchies[i];
-
- ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
- if (ret < 0)
- return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
-
- if (in_cgroup_ns && wants_force_mount) {
- /*
- * If cgroup namespaces are supported but the container
- * will not have CAP_SYS_ADMIN after it has started we
- * need to mount the cgroups manually.
- */
- ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
- dfd_mnt_tmpfs, h->at_mnt);
- if (ret < 0)
- return false;
-
- continue;
- }
-
- /* Here is where the ancient kernel section begins. */
- ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
- dfd_mnt_tmpfs, h->at_mnt);
- if (ret < 0)
- return false;
-
- if (!cg_mount_needs_subdirs(cgroup_automount_type))
- continue;
-
- if (!cgroup_root)
- cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
-
- hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
- path2 = must_make_path(hierarchy_mnt, h->at_base,
- ops->container_cgroup, NULL);
- ret = lxc_mkdir_p(path2, 0755);
- if (ret < 0 && (errno != EEXIST))
- return false;
-
- ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
- hierarchy_mnt, path2,
- ops->container_cgroup);
- if (ret < 0)
- return false;
- }
-
- return true;
+ return syserror_ret(false, "Failed to mount cgroups - unsupported cgroup layout");
}
/* Only root needs to escape to the cgroup of its init. */
@@ -2576,18 +2102,6 @@ __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
return true;
}
-static int cg_legacy_freeze(struct cgroup_ops *ops)
-{
- struct hierarchy *h;
-
- h = get_hierarchy(ops, "freezer");
- if (!h)
- return ret_set_errno(-1, ENOENT);
-
- return lxc_write_openat(h->path_con, "freezer.state",
- "FROZEN", STRLITERALLEN("FROZEN"));
-}
-
static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
struct lxc_async_descr *descr)
{
@@ -2680,24 +2194,12 @@ __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
if (!ops->hierarchies)
return ret_set_errno(-1, ENOENT);
- if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
- return cg_legacy_freeze(ops);
+ if (!pure_unified_layout(ops))
+ return ret_set_errno(-1, EOPNOTSUPP);
return cg_unified_freeze(ops, timeout);
}
-static int cg_legacy_unfreeze(struct cgroup_ops *ops)
-{
- struct hierarchy *h;
-
- h = get_hierarchy(ops, "freezer");
- if (!h)
- return ret_set_errno(-1, ENOENT);
-
- return lxc_write_openat(h->path_con, "freezer.state",
- "THAWED", STRLITERALLEN("THAWED"));
-}
-
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
return cg_unified_freeze_do(ops, timeout, "0", 0,
@@ -2710,8 +2212,8 @@ __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
if (!ops->hierarchies)
return ret_set_errno(-1, ENOENT);
- if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
- return cg_legacy_unfreeze(ops);
+ if (!pure_unified_layout(ops))
+ return ret_set_errno(-1, EOPNOTSUPP);
return cg_unified_unfreeze(ops, timeout);
}
@@ -3370,135 +2872,6 @@ static int device_cgroup_rule_parse_devpath(struct device_item *device,
return 0;
}
-static int convert_devpath(const char *invalue, char *dest)
-{
- struct device_item device = {};
- int ret;
-
- ret = device_cgroup_rule_parse_devpath(&device, invalue);
- if (ret < 0)
- return -1;
-
- ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
- device.minor, device.access);
- if (ret < 0)
- return log_error_errno(ret, -ret,
- "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
- device.type, device.major, device.minor,
- device.access);
-
- return 0;
-}
-
-/* Called from setup_limits - here we have the container's cgroup_data because
- * we created the cgroups.
- */
-static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
- const char *value, bool is_cpuset)
-{
- __do_free char *controller = NULL;
- char *p;
- /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
- char converted_value[50];
- struct hierarchy *h;
-
- controller = strdup(filename);
- if (!controller)
- return ret_errno(ENOMEM);
-
- p = strchr(controller, '.');
- if (p)
- *p = '\0';
-
- if (strequal("devices.allow", filename) && value[0] == '/') {
- int ret;
-
- ret = convert_devpath(value, converted_value);
- if (ret < 0)
- return ret;
- value = converted_value;
- }
-
- h = get_hierarchy(ops, controller);
- if (!h)
- return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
-
- if (is_cpuset) {
- int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
- if (ret)
- return ret;
- }
- return lxc_write_openat(h->path_lim, filename, value, strlen(value));
-}
-
-/*
- * Return the list of cgroup_settings sorted according to the following rules
- * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
- */
-static void sort_cgroup_settings(struct lxc_conf *conf)
-{
- LIST_HEAD(memsw_list);
- struct lxc_cgroup *cgroup, *ncgroup;
-
- /* Iterate over the cgroup settings and copy them to the output list. */
- list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
- if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes"))
- continue;
-
- /* Move the memsw entry from the cgroup settings list. */
- list_move_tail(&cgroup->head, &memsw_list);
- }
-
- /*
- * Append all the memsw entries to the end of the cgroup settings list
- * to make sure they are applied after all memory limit settings.
- */
- list_splice_tail(&memsw_list, &conf->cgroup);
-
-}
-
-__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
- struct lxc_conf *conf,
- bool do_devices)
-{
- struct list_head *cgroup_settings;
- struct lxc_cgroup *cgroup;
-
- if (!ops)
- return ret_set_errno(false, ENOENT);
-
- if (!conf)
- return ret_set_errno(false, EINVAL);
-
- cgroup_settings = &conf->cgroup;
- if (list_empty(cgroup_settings))
- return true;
-
- if (!ops->hierarchies)
- return ret_set_errno(false, EINVAL);
-
- if (pure_unified_layout(ops))
- return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
-
- sort_cgroup_settings(conf);
- list_for_each_entry(cgroup, cgroup_settings, head) {
- if (do_devices == strnequal("devices", cgroup->subsystem, 7)) {
- if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) {
- if (do_devices && (errno == EACCES || errno == EPERM)) {
- SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
- continue;
- }
- SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
- return false;
- }
- DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value);
- }
- }
-
- INFO("Limits for the legacy cgroup hierarchies have been setup");
- return true;
-}
-
/*
* Some of the parsing logic comes from the original cgroup device v1
* implementation in the kernel.
@@ -4209,7 +3582,6 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
cgfsng_ops->set = cgfsng_set;
cgfsng_ops->freeze = cgfsng_freeze;
cgfsng_ops->unfreeze = cgfsng_unfreeze;
- cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
cgfsng_ops->setup_limits = cgfsng_setup_limits;
cgfsng_ops->driver = "cgfsng";
cgfsng_ops->version = "1.0.0";
diff --git a/src/lxc/cgroups/cgroup.c b/src/lxc/cgroups/cgroup.c
index 5e2a7d0993..b8029dade5 100644
--- a/src/lxc/cgroups/cgroup.c
+++ b/src/lxc/cgroups/cgroup.c
@@ -40,14 +40,11 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf)
TRACE("Initialized cgroup driver %s", cgroup_ops->driver);
- if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_LEGACY)
- TRACE("Legacy cgroup layout");
- else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_HYBRID)
- TRACE("Hybrid cgroup layout");
- else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
+ if (pure_unified_layout(cgroup_ops)) {
TRACE("Unified cgroup layout");
- else
- WARN("Unsupported cgroup layout");
+ } else {
+ WARN("Unsupported cgroup layout (%s)", cgroup_layout_name(cgroup_ops->cgroup_layout));
+ }
return cgroup_ops;
}
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index 108e5d84ec..54c34530b9 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -262,8 +262,6 @@ struct cgroup_ops {
size_t len, const char *name, const char *lxcpath);
int (*freeze)(struct cgroup_ops *ops, int timeout);
int (*unfreeze)(struct cgroup_ops *ops, int timeout);
- bool (*setup_limits_legacy)(struct cgroup_ops *ops,
- struct lxc_conf *conf, bool with_devices);
bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_handler *handler);
bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
bool (*attach)(struct cgroup_ops *ops, const struct lxc_conf *conf,
diff --git a/src/lxc/cmd/lxc_user_nic.c b/src/lxc/cmd/lxc_user_nic.c
index 98aedf8216..83fd84a185 100644
--- a/src/lxc/cmd/lxc_user_nic.c
+++ b/src/lxc/cmd/lxc_user_nic.c
@@ -374,19 +374,58 @@ static char *get_eow(char *s, char *e)
return s;
}
+static bool same_word(const char *start, const char *end, const char *word)
+{
+	size_t span = end - start;
+	size_t expected = strlen(word);
+
+	/* [start, end) can only be @word when both have the same length. */
+	if (span != expected)
+		return false;
+
+	return strncmp(start, word, expected) == 0;
+}
+
+/*
+ * in:
+ * @buf_start and @buf_end point to the buffer to be read.
+ *
+ * @name is the name of the user who should own the link.
+ *
+ * @net_type is the type of connection, e.g. veth.
+ *
+ * @net_link is the name of the bridge, e.g. lxcbr0, on which the
+ * device should live.
+ *
+ * @net_dev is the name of the device itself in the host netns.
+ *
+ * out:
+ * @is_owner is set to true if the current line is owned by @name.
+ *
+ * @nic_found is set to true if the line is specifically for the passed-in
+ * @net_dev, and it is on the right @net_link and of the right @net_type.
+ *
+ * @exists is set to false if the nic in this line no longer exists. This is
+ * used by cull_entries(): if we set it to false, then this line will be
+ * removed from the LXC_USERNIC_DB (e.g. /var/run/lxc/nics).
+ */
static char *find_line(char *buf_start, char *buf_end, char *name,
char *net_type, char *net_link, char *net_dev,
- bool *owner, bool *found, bool *keep)
+ bool *is_owner, bool *nic_found, bool *exists)
{
char *end_of_line, *end_of_word, *line;
+ bool right_net_type, right_bridge, right_link_name;;
while (buf_start < buf_end) {
size_t len;
char netdev_name[IFNAMSIZ];
- *found = false;
- *keep = true;
- *owner = false;
+ *nic_found = false;
+ *exists = true;
+ *is_owner = false;
+ right_net_type = false;
+ right_bridge = false;
+ right_link_name = false;
end_of_line = get_eol(buf_start, buf_end);
if (end_of_line >= buf_end)
@@ -405,11 +444,8 @@ static char *find_line(char *buf_start, char *buf_end, char *name,
if (!end_of_word)
return NULL;
- if (strncmp(buf_start, name, strlen(name)))
- *found = false;
- else
- if (strlen(name) == (size_t)(end_of_word - buf_start))
- *owner = true;
+ if (same_word(buf_start, end_of_word, name))
+ *is_owner = true;
buf_start = end_of_word + 1;
while ((buf_start < buf_end) && isblank(*buf_start))
@@ -421,8 +457,8 @@ static char *find_line(char *buf_start, char *buf_end, char *name,
if (!end_of_word)
return NULL;
- if (strncmp(buf_start, net_type, strlen(net_type)))
- *found = false;
+ if (same_word(buf_start, end_of_word, net_type))
+ right_net_type = true;
buf_start = end_of_word + 1;
while ((buf_start < buf_end) && isblank(*buf_start))
@@ -434,8 +470,8 @@ static char *find_line(char *buf_start, char *buf_end, char *name,
if (!end_of_word)
return NULL;
- if (strncmp(buf_start, net_link, strlen(net_link)))
- *found = false;
+ if (same_word(buf_start, end_of_word, net_link))
+ right_bridge = true;
buf_start = end_of_word + 1;
while ((buf_start < buf_end) && isblank(*buf_start))
@@ -454,10 +490,13 @@ static char *find_line(char *buf_start, char *buf_end, char *name,
memcpy(netdev_name, buf_start, len);
netdev_name[len] = '\0';
- *keep = lxc_nic_exists(netdev_name);
+ *exists = lxc_nic_exists(netdev_name);
if (net_dev && !strcmp(netdev_name, net_dev))
- *found = true;
+ right_link_name = true;
+
+ if (right_net_type && right_bridge && right_link_name)
+ *nic_found = true;
return line;
@@ -587,7 +626,7 @@ static bool cull_entries(int fd, char *name, char *net_type, char *net_link,
size_t length = 0;
int ret;
char *buf_end, *buf_start;
- bool found, keep;
+ bool nic_found, is_owner, keep;
ret = fd_to_buf(fd, &buf, &length);
if (ret < 0) {
@@ -603,7 +642,7 @@ static bool cull_entries(int fd, char *name, char *net_type, char *net_link,
buf_start = buf;
buf_end = buf + length;
while ((buf_start = find_line(buf_start, buf_end, name, net_type,
- net_link, net_dev, &(bool){true}, &found,
+ net_link, net_dev, &is_owner, &nic_found,
&keep))) {
struct entry_line *newe;
@@ -611,7 +650,7 @@ static bool cull_entries(int fd, char *name, char *net_type, char *net_link,
if (!newe)
return false;
- if (found)
+ if (nic_found && is_owner)
*found_nicname = true;
entry_lines = newe;
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 571b3fd203..4d366d04b5 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -808,19 +808,16 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
"Failed to unlink %d(%s)",
rootfs->dfd_dev, tty_name);
- if (can_use_mount_api())
- ret = fd_bind_mount(tty->pty, "",
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH_XDEV,
- fd_to, "",
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH_XDEV,
- 0,
- 0,
- 0,
- false);
- else
- ret = mount_fd(tty->pty, fd_to, "none", MS_BIND, 0);
+ ret = fd_bind_mount(tty->pty, "",
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH_XDEV,
+ fd_to, "",
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH_XDEV,
+ 0,
+ 0,
+ 0,
+ false);
if (ret < 0)
return log_error_errno(-errno, errno,
"Failed to bind mount \"%s\" onto \"%s\"",
@@ -845,19 +842,16 @@ static int lxc_setup_ttys(struct lxc_conf *conf)
"Failed to create tty mount target %d(%s)",
rootfs->dfd_dev, rootfs->buf);
- if (can_use_mount_api())
- ret = fd_bind_mount(tty->pty, "",
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH_XDEV,
- fd_to, "",
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH,
- 0,
- 0,
- 0,
- false);
- else
- ret = mount_fd(tty->pty, fd_to, "none", MS_BIND, 0);
+ ret = fd_bind_mount(tty->pty, "",
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH_XDEV,
+ fd_to, "",
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH,
+ 0,
+ 0,
+ 0,
+ false);
if (ret < 0)
return log_error_errno(-errno, errno,
"Failed to bind mount \"%s\" onto \"%s\"",
@@ -1017,37 +1011,23 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
goto reset_umask;
}
- if (can_use_mount_api()) {
- fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
- if (fd_fs < 0)
- return log_error_errno(-errno, errno, "Failed to prepare filesystem context for tmpfs");
+ fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
+ if (fd_fs < 0)
+ return log_error_errno(-errno, errno, "Failed to prepare filesystem context for tmpfs");
- sprintf(mount_options, "%zu", tmpfs_size);
+ sprintf(mount_options, "%zu", tmpfs_size);
- ret = fs_set_property(fd_fs, "mode", "0755");
- if (ret < 0)
- return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
-
- ret = fs_set_property(fd_fs, "size", mount_options);
- if (ret < 0)
- return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
-
- ret = fs_attach(fd_fs, rootfs->dfd_mnt, "dev",
- PROTECT_OPATH_DIRECTORY,
- PROTECT_LOOKUP_BENEATH_XDEV, 0);
- } else {
- __do_free char *fallback_path = NULL;
+ ret = fs_set_property(fd_fs, "mode", "0755");
+ if (ret < 0)
+ return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
- sprintf(mount_options, "size=%zu,mode=755", tmpfs_size);
- DEBUG("Using mount options: %s", mount_options);
+ ret = fs_set_property(fd_fs, "size", mount_options);
+ if (ret < 0)
+ return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
- if (path) {
- fallback_path = must_make_path(path, "/dev", NULL);
- ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path);
- } else {
- ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL);
- }
- }
+ ret = fs_attach(fd_fs, rootfs->dfd_mnt, "dev",
+ PROTECT_OPATH_DIRECTORY,
+ PROTECT_LOOKUP_BENEATH_XDEV, 0);
if (ret < 0) {
SYSERROR("Failed to mount tmpfs on \"%s\"", path);
goto reset_umask;
@@ -1160,35 +1140,16 @@ static int lxc_fill_autodev(struct lxc_rootfs *rootfs)
if (ret < 0)
return ret_errno(EIO);
- if (can_use_mount_api()) {
- ret = fd_bind_mount(rootfs->dfd_host, rootfs->buf,
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH_XDEV,
- rootfs->dfd_dev, device->name,
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH,
- 0,
- 0,
- 0,
- false);
- } else {
- char path[PATH_MAX];
-
- ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/dev/%s", device->name);
- if (ret < 0)
- return ret_errno(EIO);
-
- ret = strnprintf(path, sizeof(path), "%s/dev/%s", get_rootfs_mnt(rootfs), device->name);
- if (ret < 0)
- return log_error(-1, "Failed to create device path for %s", device->name);
-
- ret = safe_mount(rootfs->buf, path, 0, MS_BIND, NULL, get_rootfs_mnt(rootfs));
- if (ret < 0)
- return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" to \"%s\"", rootfs->buf, path);
-
- DEBUG("Bind mounted host device node \"%s\" to \"%s\"", rootfs->buf, path);
- continue;
- }
+ ret = fd_bind_mount(rootfs->dfd_host, rootfs->buf,
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH_XDEV,
+ rootfs->dfd_dev, device->name,
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH,
+ 0,
+ 0,
+ 0,
+ false);
DEBUG("Bind mounted host device %d(%s) to %d(%s)", rootfs->dfd_host, rootfs->buf, rootfs->dfd_dev, device->name);
}
(void)umask(cmask);
@@ -1496,104 +1457,48 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler)
if (ret < 0 && errno != EEXIST)
return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
- if (can_use_mount_api()) {
- fd_fs = fs_prepare("devpts", -EBADF, "", 0, 0);
- if (fd_fs < 0)
- return syserror("Failed to prepare filesystem context for devpts");
-
- ret = fs_set_property(fd_fs, "source", "devpts");
- if (ret < 0)
- SYSTRACE("Failed to set \"source=devpts\" on devpts filesystem context %d", fd_fs);
-
- ret = fs_set_property(fd_fs, "gid", "5");
- if (ret < 0)
- SYSTRACE("Failed to set \"gid=5\" on devpts filesystem context %d", fd_fs);
-
- ret = fs_set_flag(fd_fs, "newinstance");
- if (ret < 0)
- return syserror("Failed to set \"newinstance\" property on devpts filesystem context %d", fd_fs);
-
- ret = fs_set_property(fd_fs, "ptmxmode", "0666");
- if (ret < 0)
- return syserror("Failed to set \"ptmxmode=0666\" property on devpts filesystem context %d", fd_fs);
-
- ret = fs_set_property(fd_fs, "mode", "0620");
- if (ret < 0)
- return syserror("Failed to set \"mode=0620\" property on devpts filesystem context %d", fd_fs);
+ fd_fs = fs_prepare("devpts", -EBADF, "", 0, 0);
+ if (fd_fs < 0)
+ return syserror("Failed to prepare filesystem context for devpts");
- ret = fs_set_property(fd_fs, "max", fdstr(pty_max));
- if (ret < 0)
- return syserror("Failed to set \"max=%zu\" property on devpts filesystem context %d", conf->pty_max, fd_fs);
-
- ret = fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
- if (ret < 0)
- return syserror("Failed to finalize filesystem context %d", fd_fs);
-
- devpts_fd = fsmount(fd_fs, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC);
- if (devpts_fd < 0)
- return syserror("Failed to create new mount for filesystem context %d", fd_fs);
- TRACE("Created detached devpts mount %d", devpts_fd);
-
- ret = move_mount(devpts_fd, "", rootfs->dfd_dev, "pts", MOVE_MOUNT_F_EMPTY_PATH);
- if (ret)
- return syserror("Failed to attach devpts mount %d to %d/pts", conf->devpts_fd, rootfs->dfd_dev);
-
- DEBUG("Attached detached devpts mount %d to %d/pts", devpts_fd, rootfs->dfd_dev);
- } else {
- char **opts;
- char devpts_mntopts[256];
- char *mntopt_sets[5];
- char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
-
- /*
- * Fallback codepath in case the new mount API can't be used to
- * create detached mounts.
- */
-
- ret = strnprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
- default_devpts_mntopts, pty_max);
- if (ret < 0)
- return -1;
+ ret = fs_set_property(fd_fs, "source", "devpts");
+ if (ret < 0)
+ SYSTRACE("Failed to set \"source=devpts\" on devpts filesystem context %d", fd_fs);
- /* Create mountpoint for devpts instance. */
- ret = mkdirat(rootfs->dfd_dev, "pts", 0755);
- if (ret < 0 && errno != EEXIST)
- return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
+ ret = fs_set_property(fd_fs, "gid", "5");
+ if (ret < 0)
+ SYSTRACE("Failed to set \"gid=5\" on devpts filesystem context %d", fd_fs);
- /* gid=5 && max= */
- mntopt_sets[0] = devpts_mntopts;
+ ret = fs_set_flag(fd_fs, "newinstance");
+ if (ret < 0)
+ return syserror("Failed to set \"newinstance\" property on devpts filesystem context %d", fd_fs);
- /* !gid=5 && max= */
- mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
+ ret = fs_set_property(fd_fs, "ptmxmode", "0666");
+ if (ret < 0)
+ return syserror("Failed to set \"ptmxmode=0666\" property on devpts filesystem context %d", fd_fs);
- /* gid=5 && !max= */
- mntopt_sets[2] = default_devpts_mntopts;
+ ret = fs_set_property(fd_fs, "mode", "0620");
+ if (ret < 0)
+ return syserror("Failed to set \"mode=0620\" property on devpts filesystem context %d", fd_fs);
- /* !gid=5 && !max= */
- mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
+ ret = fs_set_property(fd_fs, "max", fdstr(pty_max));
+ if (ret < 0)
+ return syserror("Failed to set \"max=%zu\" property on devpts filesystem context %d", conf->pty_max, fd_fs);
- /* end */
- mntopt_sets[4] = NULL;
+ ret = fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+ if (ret < 0)
+ return syserror("Failed to finalize filesystem context %d", fd_fs);
- for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
- /* mount new devpts instance */
- ret = mount_at(rootfs->dfd_dev, "", 0,
- rootfs->dfd_dev, "pts", PROTECT_LOOKUP_BENEATH,
- "devpts", MS_NOSUID | MS_NOEXEC, *opts);
- if (ret == 0)
- break;
- }
- if (ret < 0)
- return log_error_errno(-1, errno, "Failed to mount new devpts instance");
+ devpts_fd = fsmount(fd_fs, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC);
+ if (devpts_fd < 0)
+ return syserror("Failed to create new mount for filesystem context %d", fd_fs);
+ TRACE("Created detached devpts mount %d", devpts_fd);
- devpts_fd = open_at(rootfs->dfd_dev, "pts", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
- if (devpts_fd < 0) {
- devpts_fd = -EBADF;
- TRACE("Failed to create detached devpts mount");
- }
+ ret = move_mount(devpts_fd, "", rootfs->dfd_dev, "pts", MOVE_MOUNT_F_EMPTY_PATH);
+ if (ret)
+ return syserror("Failed to attach devpts mount %d to %d/pts", conf->devpts_fd, rootfs->dfd_dev);
- DEBUG("Mounted new devpts instance with options \"%s\"", *opts);
- }
+ DEBUG("Attached detached devpts mount %d to %d/pts", devpts_fd, rootfs->dfd_dev);
handler->conf->devpts_fd = move_fd(devpts_fd);
@@ -1739,10 +1644,7 @@ static int bind_mount_console(int fd_devpts, struct lxc_rootfs *rootfs,
* Note, there are intentionally no open or lookup restrictions since
* we're operating directly on the fd.
*/
- if (can_use_mount_api())
- return fd_bind_mount(fd_pty, "", 0, 0, fd_to, "", 0, 0, 0, 0, 0, false);
-
- return mount_fd(fd_pty, fd_to, "none", MS_BIND, 0);
+ return fd_bind_mount(fd_pty, "", 0, 0, fd_to, "", 0, 0, 0, 0, 0, false);
}
static int lxc_setup_dev_console(int fd_devpts, struct lxc_rootfs *rootfs,
@@ -1871,21 +1773,18 @@ static int lxc_setup_ttydir_console(int fd_devpts, struct lxc_rootfs *rootfs,
return syserror("Failed to open \"%d/console\"", fd_ttydir);
/* bind mount '/dev//console' to '/dev/console' */
- if (can_use_mount_api())
- ret = fd_bind_mount(fd_dev_console,
- "",
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH_XDEV,
- fd_reg_console,
- "",
- PROTECT_OPATH_FILE,
- PROTECT_LOOKUP_BENEATH,
- 0,
- 0,
- 0,
- false);
- else
- ret = mount_fd(fd_dev_console, fd_reg_console, "none", MS_BIND, 0);
+ ret = fd_bind_mount(fd_dev_console,
+ "",
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH_XDEV,
+ fd_reg_console,
+ "",
+ PROTECT_OPATH_FILE,
+ PROTECT_LOOKUP_BENEATH,
+ 0,
+ 0,
+ 0,
+ false);
if (ret < 0)
return syserror("Failed to mount \"%d\" on \"%d\"",
fd_dev_console, fd_reg_console);
@@ -3208,7 +3107,6 @@ struct lxc_conf *lxc_conf_init(void)
new->rootfs.fd_path_pin = -EBADF;
new->rootfs.dfd_idmapped = -EBADF;
new->logfd = -1;
- INIT_LIST_HEAD(&new->cgroup);
INIT_LIST_HEAD(&new->cgroup2);
/* Block ("allowlist") all devices by default. */
new->bpf_devices.list_type = LXC_BPF_DEVICE_CGROUP_ALLOWLIST;
@@ -4118,11 +4016,6 @@ int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
namespaced_token = "lxc.cgroup2.";
namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
list = &c->cgroup2;
- } else if (version == CGROUP_SUPER_MAGIC) {
- global_token = "lxc.cgroup";
- namespaced_token = "lxc.cgroup.";
- namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
- list = &c->cgroup;
} else {
return ret_errno(EINVAL);
}
@@ -4370,7 +4263,6 @@ void lxc_conf_free(struct lxc_conf *conf)
free(conf->lsm_se_keyring_context);
lxc_seccomp_free(&conf->seccomp);
lxc_clear_config_caps(conf);
- lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
lxc_clear_cgroups_devices(conf);
lxc_clear_hooks(conf, "lxc.hook");
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 762d58901d..ea4e199404 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -417,7 +417,6 @@ struct lxc_conf {
struct utsname *utsname;
struct {
- struct list_head cgroup;
struct list_head cgroup2;
struct bpf_devices bpf_devices;
};
diff --git a/src/lxc/confile.c b/src/lxc/confile.c
index 8985f15b79..3d310a9b70 100644
--- a/src/lxc/confile.c
+++ b/src/lxc/confile.c
@@ -65,7 +65,6 @@ lxc_config_define(apparmor_profile);
lxc_config_define(apparmor_raw);
lxc_config_define(cap_drop);
lxc_config_define(cap_keep);
-lxc_config_define(cgroup_controller);
lxc_config_define(cgroup2_controller);
lxc_config_define(cgroup_dir);
lxc_config_define(cgroup_monitor_dir);
@@ -206,7 +205,6 @@ static struct lxc_config_t config_jump_table[] = {
{ "lxc.cgroup.dir.container", true, set_config_cgroup_container_dir, get_config_cgroup_container_dir, clr_config_cgroup_container_dir, },
{ "lxc.cgroup.dir", true, set_config_cgroup_dir, get_config_cgroup_dir, clr_config_cgroup_dir, },
{ "lxc.cgroup.relative", true, set_config_cgroup_relative, get_config_cgroup_relative, clr_config_cgroup_relative, },
- { "lxc.cgroup", false, set_config_cgroup_controller, get_config_cgroup_controller, clr_config_cgroup_controller, },
{ "lxc.console.buffer.size", true, set_config_console_buffer_size, get_config_console_buffer_size, clr_config_console_buffer_size, },
{ "lxc.console.logfile", true, set_config_console_logfile, get_config_console_logfile, clr_config_console_logfile, },
{ "lxc.console.path", true, set_config_console_path, get_config_console_path, clr_config_console_path, },
@@ -1934,9 +1932,6 @@ static int __set_config_cgroup_controller(const char *key, const char *value,
if (version == CGROUP2_SUPER_MAGIC) {
token = "lxc.cgroup2.";
token_len = 12;
- } else if (version == CGROUP_SUPER_MAGIC) {
- token = "lxc.cgroup.";
- token_len = 11;
} else {
return ret_errno(EINVAL);
}
@@ -1962,22 +1957,12 @@ static int __set_config_cgroup_controller(const char *key, const char *value,
new_cgroup->version = version;
- if (version == CGROUP2_SUPER_MAGIC)
- list_add_tail(&new_cgroup->head, &lxc_conf->cgroup2);
- else
- list_add_tail(&new_cgroup->head, &lxc_conf->cgroup);
+ list_add_tail(&new_cgroup->head, &lxc_conf->cgroup2);
move_ptr(new_cgroup);
return 0;
}
-static int set_config_cgroup_controller(const char *key, const char *value,
- struct lxc_conf *lxc_conf, void *data)
-{
- return __set_config_cgroup_controller(key, value, lxc_conf,
- CGROUP_SUPER_MAGIC);
-}
-
static int set_config_cgroup2_controller(const char *key, const char *value,
struct lxc_conf *lxc_conf, void *data)
{
@@ -3903,11 +3888,6 @@ static int __get_config_cgroup_controller(const char *key, char *retv,
namespaced_token = "lxc.cgroup2.";
namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
list = &c->cgroup2;
- } else if (version == CGROUP_SUPER_MAGIC) {
- global_token = "lxc.cgroup";
- namespaced_token = "lxc.cgroup.";
- namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
- list = &c->cgroup;
} else {
return ret_errno(EINVAL);
}
@@ -3934,13 +3914,6 @@ static int __get_config_cgroup_controller(const char *key, char *retv,
return fulllen;
}
-static int get_config_cgroup_controller(const char *key, char *retv, int inlen,
- struct lxc_conf *c, void *data)
-{
- return __get_config_cgroup_controller(key, retv, inlen, c,
- CGROUP_SUPER_MAGIC);
-}
-
static int get_config_cgroup2_controller(const char *key, char *retv, int inlen,
struct lxc_conf *c, void *data)
{
@@ -4931,12 +4904,6 @@ static inline int clr_config_keyring_session(const char *key,
return 0;
}
-static inline int clr_config_cgroup_controller(const char *key,
- struct lxc_conf *c, void *data)
-{
- return lxc_clear_cgroups(c, key, CGROUP_SUPER_MAGIC);
-}
-
static inline int clr_config_cgroup2_controller(const char *key,
struct lxc_conf *c, void *data)
{
diff --git a/src/lxc/lsm/apparmor.c b/src/lxc/lsm/apparmor.c
index 9f31840ff7..256f8c12cb 100644
--- a/src/lxc/lsm/apparmor.c
+++ b/src/lxc/lsm/apparmor.c
@@ -84,9 +84,23 @@ static const char AA_PROFILE_BASE[] =
" # deny access under /proc/bus to avoid e.g. messing with pci devices directly\n"
" deny @{PROC}/bus/** wklx,\n"
"\n"
-" # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted\n"
+" # allow binfmt_misc to be mounted\n"
" mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/,\n"
-" deny @{PROC}/sys/fs/** wklx,\n"
+"\n"
+" # deny writes in /proc/sys/fs except /proc/sys/fs/binfmt_misc\n"
+" deny @{PROC}/sys/fs/[^b]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/b[^i]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/bi[^n]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/bin[^f]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binf[^m]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfm[^t]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfmt[^_]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfmt_[^m]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfmt_m[^i]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfmt_mi[^s]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfmt_mis[^c]*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs/binfmt_misc?*{,/**} wklx,\n"
+" deny @{PROC}/sys/fs?*{,/**} wklx,\n"
"\n"
" # allow efivars to be mounted, writing to it will be blocked though\n"
" mount fstype=efivarfs -> /sys/firmware/efi/efivars/,\n"
@@ -172,10 +186,28 @@ static const char AA_PROFILE_BASE[] =
" mount options=(rw,move) /sys?*{,/**},\n"
"\n";
+static const char AA_PROFILE_BASE_PRIVILEGED[] =
+" deny /proc/sys/fs/binfmt_misc/{,**} wklx,\n"
+"\n";
+
static const char AA_PROFILE_BASE_NO_NESTING[] =
"\n"
" # generated by: lxc-generate-aa-rules.py container-rules.base\n"
-" deny /proc/sys/[^kn]*{,/**} wklx,\n"
+" deny /proc/sys/[^fkn]*{,/**} wklx,\n"
+" deny /proc/sys/f[^s]*{,/**} wklx,\n"
+" deny /proc/sys/fs/[^b]*{,/**} wklx,\n"
+" deny /proc/sys/fs/b[^i]*{,/**} wklx,\n"
+" deny /proc/sys/fs/bi[^n]*{,/**} wklx,\n"
+" deny /proc/sys/fs/bin[^f]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binf[^m]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfm[^t]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfmt[^_]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfmt_[^m]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfmt_m[^i]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfmt_mi[^s]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfmt_mis[^c]*{,/**} wklx,\n"
+" deny /proc/sys/fs/binfmt_misc?*{,/**} wklx,\n"
+" deny /proc/sys/fs?*{,/**} wklx,\n"
" deny /proc/sys/k[^e]*{,/**} wklx,\n"
" deny /proc/sys/ke[^r]*{,/**} wklx,\n"
" deny /proc/sys/ker[^n]*{,/**} wklx,\n"
@@ -338,6 +370,8 @@ static const char AA_PROFILE_UNPRIVILEGED[] =
" ### Configuration: unprivileged container\n"
" pivot_root,\n"
"\n"
+" mount fstype=binfmt_misc,\n"
+"\n"
" # Allow modifying mount propagation\n"
" mount options=(rw,make-slave) -> /{,**},\n"
" mount options=(rw,make-rslave) -> /{,**},\n"
@@ -689,6 +723,7 @@ static const struct mntopt_t {
{ ",nodev", sizeof(",nodev")-1 },
{ ",nosuid", sizeof(",nosuid")-1 },
{ ",noexec", sizeof(",noexec")-1 },
+ { ",nosymfollow", sizeof(",nosymfollow")-1 },
};
static void append_remount_rule(char **profile, size_t *size, const char *rule)
@@ -758,6 +793,10 @@ static char *get_apparmor_profile_content(struct lsm_ops *ops, struct lxc_conf *
must_append_sized(&profile, &size, AA_PROFILE_BASE,
STRARRAYLEN(AA_PROFILE_BASE));
+ if (is_privileged(conf))
+ must_append_sized(&profile, &size, AA_PROFILE_BASE_PRIVILEGED,
+ STRARRAYLEN(AA_PROFILE_BASE_PRIVILEGED));
+
if (!conf->lsm_aa_allow_nesting)
must_append_sized(&profile, &size, AA_PROFILE_BASE_NO_NESTING,
STRARRAYLEN(AA_PROFILE_BASE_NO_NESTING));
diff --git a/src/lxc/start.c b/src/lxc/start.c
index c64e78e93f..ddec8f7624 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1774,8 +1774,17 @@ static inline int do_share_ns(void *arg)
flags |= CLONE_PARENT;
handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
&handler->pidfd);
- if (handler->pid < 0)
+ if (handler->pid < 0) {
+ ERROR("Failed to clone process");
return -1;
+ }
+
+ if (handler->pidfd < 0) {
+ kill(handler->pid, SIGKILL);
+ handler->pid = -1;
+ ERROR("CLONE_PIDFD isn't supported");
+ return -1;
+ }
return 0;
}
@@ -1920,7 +1929,7 @@ static int lxc_spawn(struct lxc_handler *handler)
/* Try to spawn directly into target cgroup. */
handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
if (handler->pid < 0) {
- SYSTRACE("Failed to spawn container directly into target cgroup");
+ SYSWARN("Failed to spawn container directly into target cgroup");
/* Kernel might simply be too old for CLONE_INTO_CGROUP. */
resolve_cgroup_clone_flags(handler);
@@ -1931,31 +1940,6 @@ static int lxc_spawn(struct lxc_handler *handler)
TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
}
- /* Kernel might be too old for clone3(). */
- if (handler->pid < 0) {
- SYSTRACE("Failed to spawn container via clone3()");
-
- /*
- * In contrast to all other architectures arm64 verifies that
- * the argument we use to retrieve the pidfd with is
- * initialized to 0. But we need to be able to initialize it to
- * a negative value such as our customary -EBADF so we can
- * detect whether this kernel supports pidfds. If the syscall
- * returns and the pidfd variable is set to something >= 0 then
- * we know this is a kernel supporting pidfds. But if we can't
- * set it to -EBADF then this won't work since 0 is a valid
- * file descriptor too. And since legacy clone silently ignores
- * unknown flags we are left without any way to detect support
- * for pidfds. So let's special-case arm64 to not fail starting
- * containers.
- */
- #if defined(__aarch64__)
- handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
- #else
- handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
- #endif
- }
-
if (handler->pid < 0) {
SYSERROR(LXC_CLONE_ERROR);
goto out_delete_net;
@@ -1975,8 +1959,10 @@ static int lxc_spawn(struct lxc_handler *handler)
goto out_delete_net;
/* Verify that we can actually make use of pidfds. */
- if (!lxc_can_use_pidfd(handler->pidfd))
+ if (!lxc_can_use_pidfd(handler->pidfd)) {
close_prot_errno_disarm(handler->pidfd);
+ goto out_delete_net;
+ }
ret = strnprintf(pidstr, 20, "%d", handler->pid);
if (ret < 0)
@@ -2020,11 +2006,6 @@ static int lxc_spawn(struct lxc_handler *handler)
}
}
- if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
- ERROR("Failed to setup cgroup limits for container \"%s\"", name);
- goto out_delete_net;
- }
-
if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
ERROR("Failed to delegate controllers to payload cgroup");
goto out_delete_net;
@@ -2113,17 +2094,6 @@ static int lxc_spawn(struct lxc_handler *handler)
if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
goto out_delete_net;
- /*
- * With isolation the limiting devices cgroup was already setup, so
- * only setup devices here if we have no namespace directory.
- */
- if (!handler->conf->cgroup_meta.namespace_dir &&
- !cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
- ERROR("Failed to setup legacy device cgroup controller limits");
- goto out_delete_net;
- }
- TRACE("Set up legacy device cgroup controller limits");
-
if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
ERROR("Failed to setup cgroup2 device controller limits");
goto out_delete_net;
diff --git a/src/lxc/storage/lvm.c b/src/lxc/storage/lvm.c
index d563144518..dbc1ccf9fa 100644
--- a/src/lxc/storage/lvm.c
+++ b/src/lxc/storage/lvm.c
@@ -137,6 +137,7 @@ static int do_lvm_create(const char *path, uint64_t size, const char *thinpool)
return log_error(-EINVAL, "Failed to detect whether \"%s\" is a thinpool", tp);
} else if (!ret) {
TRACE("Detected that \"%s\" is not a thinpool", tp);
+ free(tp);
tp = NULL;
} else {
TRACE("Detected \"%s\" is a thinpool", tp);
diff --git a/src/lxc/storage/storage.c b/src/lxc/storage/storage.c
index 497cad7882..2490d3c333 100644
--- a/src/lxc/storage/storage.c
+++ b/src/lxc/storage/storage.c
@@ -516,6 +516,8 @@ struct lxc_storage *storage_copy(struct lxc_container *c, const char *cname,
}
on_success:
+ /* The only caller, copy_storage, doesn't ever close this. */
+ close_prot_errno_disarm(new_rootfs.dfd_idmapped);
lxc_storage_put(c->lxc_conf);
return new;
@@ -524,6 +526,7 @@ struct lxc_storage *storage_copy(struct lxc_container *c, const char *cname,
storage_put(new);
on_error_put_orig:
+ close_prot_errno_disarm(new_rootfs.dfd_idmapped);
lxc_storage_put(c->lxc_conf);
return NULL;
diff --git a/src/tests/get_item.c b/src/tests/get_item.c
index 40cc564adf..52559c1b2b 100644
--- a/src/tests/get_item.c
+++ b/src/tests/get_item.c
@@ -622,22 +622,22 @@ int main(int argc, char *argv[])
goto out;
}
- ret = c->get_config_item(c, "lxc.cgroup", v3, 2047);
+ ret = c->get_config_item(c, "lxc.cgroup2", v3, 2047);
if (ret < 0) {
- fprintf(stderr, "%d: get_config_item(cgroup.devices) returned %d\n", __LINE__, ret);
+ fprintf(stderr, "%d: get_config_item(cgroup2.devices) returned %d\n", __LINE__, ret);
goto out;
}
- printf("%d: get_config_item (cgroup.devices) returned %d %s\n", __LINE__, ret, v3);
+ printf("%d: get_config_item (cgroup2.devices) returned %d %s\n", __LINE__, ret, v3);
- ret = c->get_config_item(c, "lxc.cgroup.devices.allow", v3, 2047);
+ ret = c->get_config_item(c, "lxc.cgroup2.devices.allow", v3, 2047);
if (ret < 0) {
- fprintf(stderr, "%d: get_config_item(cgroup.devices.devices.allow) returned %d\n", __LINE__, ret);
+ fprintf(stderr, "%d: get_config_item(cgroup2.devices.devices.allow) returned %d\n", __LINE__, ret);
goto out;
}
- printf("%d: get_config_item (cgroup.devices.devices.allow) returned %d %s\n", __LINE__, ret, v3);
+ printf("%d: get_config_item (cgroup2.devices.devices.allow) returned %d %s\n", __LINE__, ret, v3);
- if (!c->clear_config_item(c, "lxc.cgroup")) {
- fprintf(stderr, "%d: failed clearing lxc.cgroup\n", __LINE__);
+ if (!c->clear_config_item(c, "lxc.cgroup2")) {
+ fprintf(stderr, "%d: failed clearing lxc.cgroup2\n", __LINE__);
goto out;
}
diff --git a/src/tests/lxc-test-checkpoint-restore b/src/tests/lxc-test-checkpoint-restore
index 21f9b7c652..3a2853d644 100755
--- a/src/tests/lxc-test-checkpoint-restore
+++ b/src/tests/lxc-test-checkpoint-restore
@@ -42,7 +42,7 @@ cat >> "$(lxc-config lxc.lxcpath)/$name/config" </dev/null 2>&1 || FAIL "missing busybox's tee applet"
out=$(mktemp /tmp/out_XXXX)
BS=1000000
-( sleep 3; echo "echo DATASTART ; dd if=/dev/urandom bs=$BS count=1 status=none | hexdump | tee /root/large-data.txt ; echo DATAEND" ; sleep 1 ) | \
+( sleep 3; echo "echo DATASTART ; dd if=/dev/urandom bs=$BS count=1 status=none | hexdump | tee /root/large-data.txt ; echo DATAEND" ; sleep 3 ) | \
script -q -e -c "lxc-attach -n busy -l trace -o \"${ATTACH_LOG}\"" | \
sed -n '/DATASTART/,/DATAEND/{/DATASTART/d;/DATAEND/d;s/[\r\n]*$//;p}' > $out
diff --git a/src/tests/lxc-test-usernic-2.in b/src/tests/lxc-test-usernic-2.in
new file mode 100755
index 0000000000..567ac6e5a2
--- /dev/null
+++ b/src/tests/lxc-test-usernic-2.in
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# SPDX-License-Identifier: LGPL-2.1+
+
+# lxc: linux Container library
+#
+# This test verifies that lxc-user-nic can't be used by an
+# unprivileged user to delete another user's ovs nics.
+#
+# This test assumes an Ubuntu host
+
+DONE=0 # set to 1 only on the script's last line; cleanup prints PASS only when it is 1
+LXC_USER_NIC="@LIBEXECDIR@/lxc/lxc-user-nic" # @LIBEXECDIR@ looks like a template placeholder, presumably substituted when the script is generated from this .in file - confirm
+
+apt-get -y install openvswitch-switch # runs before "set -eux" below, so a failed install does not abort the script at this point
+
+run_cmd() { # Run the remaining arguments as a command for user $1 through "sudo -i -u".
+ u=$1 # first argument: the user to run as
+ shift # drop the user name so that $* holds only the command
+ sudo -i -u $u \
+ env http_proxy=${http_proxy:-} https_proxy=${https_proxy:-} \
+ XDG_RUNTIME_DIR=/run/user/$(id -u $u) ASAN_OPTIONS=${ASAN_OPTIONS:-} \
+ UBSAN_OPTIONS=${UBSAN_OPTIONS:-} $* # proxy and sanitizer variables are forwarded (empty when unset); $* is unquoted, so the command string is word-split
+}
+
+cleanup() { # Undo what the test set up, then print PASS or FAIL and exit according to $DONE; arguments passed by callers are not read.
+ set +e # teardown steps may fail without stopping the remaining ones
+
+ ( # subshell: all teardown output is discarded by the redirection on the closing parenthesis
+ run_cmd usernic-first "lxc-stop -n b1 -k"
+ run_cmd usernic-second "lxc-stop -n b1 -k"
+ sed -i '/usernic-first/d' /run/lxc/nics /etc/lxc/lxc-usernet # remove every line mentioning the user from both files
+ sed -i '/usernic-second/d' /run/lxc/nics /etc/lxc/lxc-usernet
+ ovs-vsctl del-br usernic-vs
+
+ pkill -u $(id -u usernic-first) -9 # kill whatever is still running as the test users
+ pkill -u $(id -u usernic-second) -9
+
+ rm -rf /tmp/usernic-test
+ rm -rf /home/usernic-first /run/user/$(id -u usernic-first) # id -u must run while the user still exists, hence before deluser
+ rm -rf /home/usernic-second /run/user/$(id -u usernic-second)
+
+ deluser usernic-first
+ deluser usernic-second
+ ) >/dev/null 2>&1
+
+ if [ "$DONE" = "1" ]; then # DONE=1 is assigned only at the very end of the script
+ echo "PASS"
+ exit 0
+ fi
+
+ echo "FAIL"
+ exit 1
+}
+
+set -eux # abort on command failure and on unset variables; trace each command
+trap cleanup EXIT SIGHUP SIGINT SIGTERM # cleanup prints the verdict and chooses the exit status
+
+# create a test user
+deluser usernic-first || true # remove a leftover user from an earlier run; failure is ignored
+useradd usernic-first
+mkdir -p /home/usernic-first
+chown usernic-first: /home/usernic-first
+usermod -v 910000-919999 -w 910000-919999 usernic-first # subordinate uid/gid range; same start and size as the lxc.idmap lines written below
+
+mkdir -p /home/usernic-first/.config/lxc/
+cat > /home/usernic-first/.config/lxc/default.conf << EOF
+lxc.net.0.type = veth
+lxc.net.0.link = usernic-vs
+lxc.net.0.flags = up
+lxc.idmap = u 0 910000 10000
+lxc.idmap = g 0 910000 10000
+EOF
+
+deluser usernic-second || true # same setup for the second user, with a separate id range
+useradd usernic-second
+mkdir -p /home/usernic-second
+chown usernic-second: /home/usernic-second
+usermod -v 920000-929999 -w 920000-929999 usernic-second
+
+mkdir -p /home/usernic-second/.config/lxc/
+cat > /home/usernic-second/.config/lxc/default.conf << EOF
+lxc.net.0.type = veth
+lxc.net.0.link = usernic-vs
+lxc.net.0.flags = up
+lxc.idmap = u 0 920000 10000
+lxc.idmap = g 0 920000 10000
+lxc.apparmor.profile = lxc-container-default-with-nesting
+EOF
+
+mkdir -p /run/user/$(id -u usernic-first) /run/user/$(id -u usernic-second) # the directories run_cmd exports as XDG_RUNTIME_DIR
+chown -R usernic-first: /run/user/$(id -u usernic-first) /home/usernic-first
+chown -R usernic-second: /run/user/$(id -u usernic-second) /home/usernic-second
+
+ovs-vsctl add-br usernic-vs # the bridge named by lxc.net.0.link in both default.conf files
+
+# Give each a quota of one nic on this bridge
+touch /etc/lxc/lxc-usernet
+sed -i '/^usernic-first/d' /etc/lxc/lxc-usernet # drop stale entries before appending fresh ones
+sed -i '/^usernic-second/d' /etc/lxc/lxc-usernet
+echo "usernic-second veth usernic-vs 1" >> /etc/lxc/lxc-usernet
+echo "usernic-first veth usernic-vs 1" >> /etc/lxc/lxc-usernet
+
+run_cmd usernic-first "lxc-create -t busybox -n b1"
+run_cmd usernic-first "lxc-start -n b1 -d"
+run_cmd usernic-first "lxc-wait -n b1 -s RUNNING"
+p1=$(run_cmd usernic-first "lxc-info -n b1 -p -H") # pid reported for the first user's container; not referenced again in this script
+
+run_cmd usernic-second "lxc-create -t busybox -n b1"
+run_cmd usernic-second "lxc-start -n b1 -d"
+run_cmd usernic-second "lxc-wait -n b1 -s RUNNING"
+p2=$(run_cmd usernic-second "lxc-info -n b1 -p -H") # pid reported for the second user's container; used later to build /proc/$p2/ns/net
+
+ovs-vsctl list-ports usernic-vs # output only; the count is taken by the next command
+n1=$(ovs-vsctl list-ports usernic-vs | wc -l)
+if [[ $n1 -ne 2 ]]; then # two containers were started, so two ports are expected on the bridge
+ echo "wrong number of nics"
+ cleanup 1
+fi
+
+dev=$(grep usernic-first /run/lxc/nics | cut -f 4 -d\ ); [ -n "$dev" ] || { echo "FAIL: no nic found for usernic-first"; cleanup 1; } # field 4 is presumably the nic name; an empty value would make the delete below fail for the wrong reason
+if run_cmd usernic-second \
+ "$LXC_USER_NIC delete xx xx /proc/$p2/ns/net veth usernic-vs $dev"; then
+ echo "FAIL: unpriv user could unlink another user's ovs port"
+ cleanup 1
+fi # getting past this point means the delete command returned non-zero, which is the expected outcome
+
+echo "All tests passed"
+DONE=1
diff --git a/src/tests/meson.build b/src/tests/meson.build
index 2b997b73f5..5fc7147fce 100644
--- a/src/tests/meson.build
+++ b/src/tests/meson.build
@@ -401,6 +401,13 @@ if want_tests
input: 'lxc-test-usernic.in',
output: 'lxc-test-usernic')
+ test_programs += configure_file( # generate lxc-test-usernic-2 from its .in template using conf, and install it into bindir
+ configuration: conf,
+ install: true,
+ install_dir: bindir,
+ input: 'lxc-test-usernic-2.in',
+ output: 'lxc-test-usernic-2')
+
test_programs += configure_file(
configuration: dummy_config_data,
install: true,
diff --git a/src/tests/parse_config_file.c b/src/tests/parse_config_file.c
index 6bdac3609b..fdc35ae3e8 100644
--- a/src/tests/parse_config_file.c
+++ b/src/tests/parse_config_file.c
@@ -391,19 +391,19 @@ int main(int argc, char *argv[])
goto non_test_error;
}
- if (set_get_compare_clear_save_load(c, "lxc.cgroup.cpuset.cpus",
+ if (set_get_compare_clear_save_load(c, "lxc.cgroup2.cpuset.cpus",
"1-100", tmpf, false) < 0) {
- lxc_error("%s\n", "lxc.cgroup.cpuset.cpus");
+ lxc_error("%s\n", "lxc.cgroup2.cpuset.cpus");
goto non_test_error;
}
- if (!c->set_config_item(c, "lxc.cgroup.cpuset.cpus", "1-100")) {
- lxc_error("%s\n", "failed to set config item \"lxc.cgroup.cpuset.cpus\" to \"1-100\"");
+ if (!c->set_config_item(c, "lxc.cgroup2.cpuset.cpus", "1-100")) {
+ lxc_error("%s\n", "failed to set config item \"lxc.cgroup2.cpuset.cpus\" to \"1-100\"");
return -1;
}
- if (!c->set_config_item(c, "lxc.cgroup.memory.limit_in_bytes", "123456789")) {
- lxc_error("%s\n", "failed to set config item \"lxc.cgroup.memory.limit_in_bytes\" to \"123456789\"");
+ if (!c->set_config_item(c, "lxc.cgroup2.memory.max", "123456789")) {
+ lxc_error("%s\n", "failed to set config item \"lxc.cgroup2.memory.max\" to \"123456789\"");
return -1;
}