diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 0260ed053efd..558c3a739baf 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing and experimenting easier, the kernel parameter cgroup_no_v1= allows disabling controllers in v1 and make them always available in v2. +cgroup v2 currently supports the following mount options. + + nsdelegate + + Consider cgroup namespaces as delegation boundaries. This + option is system wide and can only be set on mount or modified + through remount from the init namespace. The mount option is + ignored on non-init namespace mounts. Please refer to the + Delegation section for details. + 2-2. Organizing Processes @@ -308,19 +318,27 @@ file. 2-5-1. Model of Delegation -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" and -"cgroup.subtree_control" files to the user. Note that resource -control interface files in a given directory control the distribution -of the parent's resources and thus must not be delegated along with -the directory. +A cgroup can be delegated in two ways. First, to a less privileged +user by granting write access of the directory and its "cgroup.procs" +and "cgroup.subtree_control" files to the user. Second, if the +"nsdelegate" mount option is set, automatically to a cgroup namespace +on namespace creation. -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it received from the parent. The limits and other settings of all -resource controllers are hierarchical and regardless of what happens -in the delegated sub-hierarchy, nothing can escape the resource -restrictions imposed by the parent. +Because the resource control interface files in a given directory +control the distribution of the parent's resources, the delegatee +shouldn't be allowed to write to them. For the first method, this is +achieved by not granting access to these files. For the second, the +kernel rejects writes to all files other than "cgroup.procs" and +"cgroup.subtree_control" on a namespace root from inside the +namespace. + +The end results are equivalent for both delegation types. Once +delegated, the user can build sub-hierarchy under the directory, +organize processes inside it as it sees fit and further distribute the +resources it received from the parent. The limits and other settings +of all resource controllers are hierarchical and regardless of what +happens in the delegated sub-hierarchy, nothing can escape the +resource restrictions imposed by the parent. Currently, cgroup doesn't impose any restrictions on the number of cgroups in or nesting depth of a delegated sub-hierarchy; however, @@ -330,10 +348,12 @@ this may be limited explicitly in the future. 2-5-2. Delegation Containment A delegated sub-hierarchy is contained in the sense that processes -can't be moved into or out of the sub-hierarchy by the delegatee. For -a process with a non-root euid to migrate a target process into a -cgroup by writing its PID to the "cgroup.procs" file, the following -conditions must be met. +can't be moved into or out of the sub-hierarchy by the delegatee. + +For delegations to a less privileged user, this is achieved by +requiring the following conditions for a process with a non-root euid +to migrate a target process into a cgroup by writing its PID to the +"cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file. @@ -360,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would not have write access to its "cgroup.procs" files and thus the write will be denied with -EACCES. +For delegations to namespaces, containment is achieved by requiring +that both the source and destination cgroups are reachable from the +namespace of the process which is attempting the migration. If either +is not reachable, the migration is rejected with -ENOENT. + 2-6. Guidelines @@ -1414,7 +1439,7 @@ D. Deprecated v1 Core Features - Multiple hierarchies including named ones are not supported. -- All mount options and remounting are not supported. +- All v1 mount options are not supported. - The "tasks" file is removed and "cgroup.procs" is not sorted. diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3bc4196bf217..09f4c7df1478 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -67,12 +67,21 @@ enum { enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ + + /* + * Consider namespaces as delegation boundaries. If this flag is + * set, controller specific interface files in a namespace root + * aren't writeable from inside the namespace. + */ + CGRP_ROOT_NS_DELEGATE = (1 << 3), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d48069ee84c2..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1547,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } +static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +{ + char *token; + + *root_flags = 0; + + if (!data) + return 0; + + while ((token = strsep(&data, ",")) != NULL) { + if (!strcmp(token, "nsdelegate")) { + *root_flags |= CGRP_ROOT_NS_DELEGATE; + continue; + } + + pr_err("cgroup2: unknown option \"%s\"\n", token); + return -EINVAL; + } + + return 0; +} + +static void apply_cgroup_root_flags(unsigned int root_flags) +{ + if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { + if (root_flags & CGRP_ROOT_NS_DELEGATE) + cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + } +} + +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) + seq_puts(seq, ",nsdelegate"); + return 0; +} + static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - pr_err("remount is not allowed\n"); - return -EINVAL; + unsigned int root_flags; + int ret; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) + return ret; + + apply_cgroup_root_flags(root_flags); + return 0; } /* @@ -1790,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; + int ret; get_cgroup_ns(ns); @@ -1807,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cgroup_enable_task_cg_lists(); if (fs_type == &cgroup2_fs_type) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + unsigned int root_flags; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) { put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); + return ERR_PTR(ret); } + cgrp_dfl_visible = true; cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); + if (!IS_ERR(dentry)) + apply_cgroup_root_flags(root_flags); } else { dentry = cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); @@ -2364,6 +2416,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct kernfs_open_file *of) { struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; struct cgroup *src_cgrp, *com_cgrp; struct inode *inode; int ret; @@ -2407,6 +2461,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (ret) return ret; + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. + */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, root_cgrp) || + !cgroup_is_descendant(dst_cgrp, root_cgrp))) + return -ENOENT; + return 0; } @@ -2971,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; + /* + * If namespaces are delegation boundaries, disallow writes to + * files in an non-init namespace root from inside the namespace + * except for the files explicitly marked delegatable - + * cgroup.procs and cgroup.subtree_control. + */ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; + if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3809,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", + .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, @@ -3822,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.subtree_control", + .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4410,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .show_options = cgroup_show_options, .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir,