343 lines
11 KiB
Diff
343 lines
11 KiB
Diff
|
From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001
|
||
|
From: Hugh Dickins <hughd@google.com>
|
||
|
Date: Thu, 06 Feb 2014 23:56:01 +0000
|
||
|
Subject: cgroup: use an ordered workqueue for cgroup destruction
|
||
|
|
||
|
Sometimes the cleanup after memcg hierarchy testing gets stuck in
|
||
|
mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0.
|
||
|
|
||
|
There may turn out to be several causes, but a major cause is this: the
|
||
|
workitem to offline parent can get run before workitem to offline child;
|
||
|
parent's mem_cgroup_reparent_charges() circles around waiting for the
|
||
|
child's pages to be reparented to its lrus, but it's holding cgroup_mutex
|
||
|
which prevents the child from reaching its mem_cgroup_reparent_charges().
|
||
|
|
||
|
Just use an ordered workqueue for cgroup_destroy_wq.
|
||
|
|
||
|
tj: Committing as the temporary fix until the reverse dependency can
|
||
|
be removed from memcg. Comment updated accordingly.
|
||
|
|
||
|
Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction")
|
||
|
Suggested-by: Filipe Brandenburger <filbranden@google.com>
|
||
|
Signed-off-by: Hugh Dickins <hughd@google.com>
|
||
|
Cc: stable@vger.kernel.org # 3.10+
|
||
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||
|
---
|
||
|
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||
|
index e2f46ba..aa95591 100644
|
||
|
--- a/kernel/cgroup.c
|
||
|
+++ b/kernel/cgroup.c
|
||
|
@@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void)
|
||
|
/*
|
||
|
* There isn't much point in executing destruction path in
|
||
|
* parallel. Good chunk is serialized with cgroup_mutex anyway.
|
||
|
- * Use 1 for @max_active.
|
||
|
+ *
|
||
|
+ * XXX: Must be ordered to make sure parent is offlined after
|
||
|
+ * children. The ordering requirement is for memcg where a
|
||
|
+ * parent's offline may wait for a child's leading to deadlock. In
|
||
|
+ * the long term, this should be fixed from memcg side.
|
||
|
*
|
||
|
* We would prefer to do this in cgroup_init() above, but that
|
||
|
* is called before init_workqueues(): so leave this until after.
|
||
|
*/
|
||
|
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
|
||
|
+ cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0);
|
||
|
BUG_ON(!cgroup_destroy_wq);
|
||
|
|
||
|
/*
|
||
|
--
|
||
|
cgit v0.9.2
|
||
|
From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001
|
||
|
From: Tejun Heo <tj@kernel.org>
|
||
|
Date: Sat, 08 Feb 2014 15:26:33 +0000
|
||
|
Subject: cgroup: fix error return value in cgroup_mount()
|
||
|
|
||
|
When cgroup_mount() fails to allocate an id for the root, it didn't
|
||
|
set ret before jumping to unlock_drop ending up returning 0 after a
|
||
|
failure. Fix it.
|
||
|
|
||
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||
|
Acked-by: Li Zefan <lizefan@huawei.com>
|
||
|
Cc: stable@vger.kernel.org
|
||
|
---
|
||
|
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||
|
index aa95591..793f371 100644
|
||
|
--- a/kernel/cgroup.c
|
||
|
+++ b/kernel/cgroup.c
|
||
|
@@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||
|
mutex_lock(&cgroup_mutex);
|
||
|
mutex_lock(&cgroup_root_mutex);
|
||
|
|
||
|
- root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
|
||
|
- 0, 1, GFP_KERNEL);
|
||
|
- if (root_cgrp->id < 0)
|
||
|
+ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
|
||
|
+ if (ret < 0)
|
||
|
goto unlock_drop;
|
||
|
+ root_cgrp->id = ret;
|
||
|
|
||
|
/* Check for name clashes with existing mounts */
|
||
|
ret = -EBUSY;
|
||
|
--
|
||
|
cgit v0.9.2
|
||
|
From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001
|
||
|
From: Tejun Heo <tj@kernel.org>
|
||
|
Date: Sat, 08 Feb 2014 15:26:33 +0000
|
||
|
Subject: cgroup: fix error return from cgroup_create()
|
||
|
|
||
|
cgroup_create() was returning 0 after allocation failures. Fix it.
|
||
|
|
||
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||
|
Acked-by: Li Zefan <lizefan@huawei.com>
|
||
|
Cc: stable@vger.kernel.org
|
||
|
---
|
||
|
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||
|
index 793f371..0eb7b86 100644
|
||
|
--- a/kernel/cgroup.c
|
||
|
+++ b/kernel/cgroup.c
|
||
|
@@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
struct cgroup *cgrp;
|
||
|
struct cgroup_name *name;
|
||
|
struct cgroupfs_root *root = parent->root;
|
||
|
- int ssid, err = 0;
|
||
|
+ int ssid, err;
|
||
|
struct cgroup_subsys *ss;
|
||
|
struct super_block *sb = root->sb;
|
||
|
|
||
|
@@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
return -ENOMEM;
|
||
|
|
||
|
name = cgroup_alloc_name(dentry);
|
||
|
- if (!name)
|
||
|
+ if (!name) {
|
||
|
+ err = -ENOMEM;
|
||
|
goto err_free_cgrp;
|
||
|
+ }
|
||
|
rcu_assign_pointer(cgrp->name, name);
|
||
|
|
||
|
/*
|
||
|
@@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
* a half-baked cgroup.
|
||
|
*/
|
||
|
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
|
||
|
- if (cgrp->id < 0)
|
||
|
+ if (cgrp->id < 0) {
|
||
|
+ err = -ENOMEM;
|
||
|
goto err_free_name;
|
||
|
+ }
|
||
|
|
||
|
/*
|
||
|
* Only live parents can have children. Note that the liveliness
|
||
|
--
|
||
|
cgit v0.9.2
|
||
|
From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001
|
||
|
From: Tejun Heo <tj@kernel.org>
|
||
|
Date: Sat, 08 Feb 2014 15:26:34 +0000
|
||
|
Subject: cgroup: fix locking in cgroup_cfts_commit()
|
||
|
|
||
|
cgroup_cfts_commit() walks the cgroup hierarchy that the target
|
||
|
subsystem is attached to and tries to apply the file changes. Due to
|
||
|
the convolution with inode locking, it can't keep cgroup_mutex locked
|
||
|
while iterating. It currently holds only RCU read lock around the
|
||
|
actual iteration and then pins the found cgroup using dget().
|
||
|
|
||
|
Unfortunately, this is incorrect. Although the iteration does check
|
||
|
cgroup_is_dead() before invoking dget(), there's nothing which
|
||
|
prevents the dentry from going away inbetween. Note that this is
|
||
|
different from the usual css iterations where css_tryget() is used to
|
||
|
pin the css - css_tryget() tests whether the css can be pinned and
|
||
|
fails if not.
|
||
|
|
||
|
The problem can be solved by simply holding cgroup_mutex instead of
|
||
|
RCU read lock around the iteration, which actually reduces LOC.
|
||
|
|
||
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||
|
Acked-by: Li Zefan <lizefan@huawei.com>
|
||
|
Cc: stable@vger.kernel.org
|
||
|
---
|
||
|
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||
|
index 0eb7b86..3edf716 100644
|
||
|
--- a/kernel/cgroup.c
|
||
|
+++ b/kernel/cgroup.c
|
||
|
@@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
|
||
|
*/
|
||
|
update_before = cgroup_serial_nr_next;
|
||
|
|
||
|
- mutex_unlock(&cgroup_mutex);
|
||
|
-
|
||
|
/* add/rm files for all cgroups created before */
|
||
|
- rcu_read_lock();
|
||
|
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
|
||
|
struct cgroup *cgrp = css->cgroup;
|
||
|
|
||
|
@@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
|
||
|
|
||
|
inode = cgrp->dentry->d_inode;
|
||
|
dget(cgrp->dentry);
|
||
|
- rcu_read_unlock();
|
||
|
-
|
||
|
dput(prev);
|
||
|
prev = cgrp->dentry;
|
||
|
|
||
|
+ mutex_unlock(&cgroup_mutex);
|
||
|
mutex_lock(&inode->i_mutex);
|
||
|
mutex_lock(&cgroup_mutex);
|
||
|
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
|
||
|
ret = cgroup_addrm_files(cgrp, cfts, is_add);
|
||
|
- mutex_unlock(&cgroup_mutex);
|
||
|
mutex_unlock(&inode->i_mutex);
|
||
|
-
|
||
|
- rcu_read_lock();
|
||
|
if (ret)
|
||
|
break;
|
||
|
}
|
||
|
- rcu_read_unlock();
|
||
|
+ mutex_unlock(&cgroup_mutex);
|
||
|
dput(prev);
|
||
|
deactivate_super(sb);
|
||
|
return ret;
|
||
|
--
|
||
|
cgit v0.9.2
|
||
|
From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001
|
||
|
From: Li Zefan <lizefan@huawei.com>
|
||
|
Date: Tue, 11 Feb 2014 08:05:46 +0000
|
||
|
Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex
|
||
|
|
||
|
Setup cgroupfs like this:
|
||
|
# mount -t cgroup -o cpuacct xxx /cgroup
|
||
|
# mkdir /cgroup/sub1
|
||
|
# mkdir /cgroup/sub2
|
||
|
|
||
|
Then run these two commands:
|
||
|
# for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } &
|
||
|
# for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } &
|
||
|
|
||
|
After seconds you may see this warning:
|
||
|
|
||
|
------------[ cut here ]------------
|
||
|
WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0()
|
||
|
idr_remove called for id=6 which is not allocated.
|
||
|
...
|
||
|
Call Trace:
|
||
|
[<ffffffff8156063c>] dump_stack+0x7a/0x96
|
||
|
[<ffffffff810591ac>] warn_slowpath_common+0x8c/0xc0
|
||
|
[<ffffffff81059296>] warn_slowpath_fmt+0x46/0x50
|
||
|
[<ffffffff81300aa7>] sub_remove+0x87/0x1b0
|
||
|
[<ffffffff810f3f02>] ? css_killed_work_fn+0x32/0x1b0
|
||
|
[<ffffffff81300bf5>] idr_remove+0x25/0xd0
|
||
|
[<ffffffff810f2bab>] cgroup_destroy_css_killed+0x5b/0xc0
|
||
|
[<ffffffff810f4000>] css_killed_work_fn+0x130/0x1b0
|
||
|
[<ffffffff8107cdbc>] process_one_work+0x26c/0x550
|
||
|
[<ffffffff8107eefe>] worker_thread+0x12e/0x3b0
|
||
|
[<ffffffff81085f96>] kthread+0xe6/0xf0
|
||
|
[<ffffffff81570bac>] ret_from_fork+0x7c/0xb0
|
||
|
---[ end trace 2d1577ec10cf80d0 ]---
|
||
|
|
||
|
It's because allocating/removing cgroup ID is not properly synchronized.
|
||
|
|
||
|
The bug was introduced when we converted cgroup_ida to cgroup_idr.
|
||
|
While synchronization is already done inside ida_simple_{get,remove}(),
|
||
|
users are responsible for concurrent calls to idr_{alloc,remove}().
|
||
|
|
||
|
tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from
|
||
|
cgroup_create()").
|
||
|
|
||
|
Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr")
|
||
|
Cc: <stable@vger.kernel.org> #3.12+
|
||
|
Reported-by: Michal Hocko <mhocko@suse.cz>
|
||
|
Signed-off-by: Li Zefan <lizefan@huawei.com>
|
||
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||
|
---
|
||
|
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
|
||
|
index 5c09759..9450f02 100644
|
||
|
--- a/include/linux/cgroup.h
|
||
|
+++ b/include/linux/cgroup.h
|
||
|
@@ -166,6 +166,8 @@ struct cgroup {
|
||
|
*
|
||
|
* The ID of the root cgroup is always 0, and a new cgroup
|
||
|
* will be assigned with a smallest available ID.
|
||
|
+ *
|
||
|
+ * Allocating/Removing ID must be protected by cgroup_mutex.
|
||
|
*/
|
||
|
int id;
|
||
|
|
||
|
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
|
||
|
index 3edf716..52719ce 100644
|
||
|
--- a/kernel/cgroup.c
|
||
|
+++ b/kernel/cgroup.c
|
||
|
@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
|
||
|
* per-subsystem and moved to css->id so that lookups are
|
||
|
* successful until the target css is released.
|
||
|
*/
|
||
|
+ mutex_lock(&cgroup_mutex);
|
||
|
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
|
||
|
+ mutex_unlock(&cgroup_mutex);
|
||
|
cgrp->id = -1;
|
||
|
|
||
|
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
|
||
|
@@ -4168,16 +4170,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
rcu_assign_pointer(cgrp->name, name);
|
||
|
|
||
|
/*
|
||
|
- * Temporarily set the pointer to NULL, so idr_find() won't return
|
||
|
- * a half-baked cgroup.
|
||
|
- */
|
||
|
- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
|
||
|
- if (cgrp->id < 0) {
|
||
|
- err = -ENOMEM;
|
||
|
- goto err_free_name;
|
||
|
- }
|
||
|
-
|
||
|
- /*
|
||
|
* Only live parents can have children. Note that the liveliness
|
||
|
* check isn't strictly necessary because cgroup_mkdir() and
|
||
|
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
|
||
|
@@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
*/
|
||
|
if (!cgroup_lock_live_group(parent)) {
|
||
|
err = -ENODEV;
|
||
|
- goto err_free_id;
|
||
|
+ goto err_free_name;
|
||
|
+ }
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Temporarily set the pointer to NULL, so idr_find() won't return
|
||
|
+ * a half-baked cgroup.
|
||
|
+ */
|
||
|
+ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
|
||
|
+ if (cgrp->id < 0) {
|
||
|
+ err = -ENOMEM;
|
||
|
+ goto err_unlock;
|
||
|
}
|
||
|
|
||
|
/* Grab a reference on the superblock so the hierarchy doesn't
|
||
|
@@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
*/
|
||
|
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
|
||
|
if (err < 0)
|
||
|
- goto err_unlock;
|
||
|
+ goto err_free_id;
|
||
|
lockdep_assert_held(&dentry->d_inode->i_mutex);
|
||
|
|
||
|
cgrp->serial_nr = cgroup_serial_nr_next++;
|
||
|
@@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
|
||
|
|
||
|
return 0;
|
||
|
|
||
|
-err_unlock:
|
||
|
- mutex_unlock(&cgroup_mutex);
|
||
|
- /* Release the reference count that we took on the superblock */
|
||
|
- deactivate_super(sb);
|
||
|
err_free_id:
|
||
|
idr_remove(&root->cgroup_idr, cgrp->id);
|
||
|
+ /* Release the reference count that we took on the superblock */
|
||
|
+ deactivate_super(sb);
|
||
|
+err_unlock:
|
||
|
+ mutex_unlock(&cgroup_mutex);
|
||
|
err_free_name:
|
||
|
kfree(rcu_dereference_raw(cgrp->name));
|
||
|
err_free_cgrp:
|
||
|
--
|
||
|
cgit v0.9.2
|