87c31b39ab
Pull user namespace related fixes from Eric Biederman: "As these are bug fixes almost all of thes changes are marked for backporting to stable. The first change (implicitly adding MNT_NODEV on remount) addresses a regression that was created when security issues with unprivileged remount were closed. I go on to update the remount test to make it easy to detect if this issue reoccurs. Then there are a handful of mount and umount related fixes. Then half of the changes deal with the a recently discovered design bug in the permission checks of gid_map. Unix since the beginning has allowed setting group permissions on files to less than the user and other permissions (aka ---rwx---rwx). As the unix permission checks stop as soon as a group matches, and setgroups allows setting groups that can not later be dropped, results in a situtation where it is possible to legitimately use a group to assign fewer privileges to a process. Which means dropping a group can increase a processes privileges. The fix I have adopted is that gid_map is now no longer writable without privilege unless the new file /proc/self/setgroups has been set to permanently disable setgroups. The bulk of user namespace using applications even the applications using applications using user namespaces without privilege remain unaffected by this change. Unfortunately this ix breaks a couple user space applications, that were relying on the problematic behavior (one of which was tools/selftests/mount/unprivileged-remount-test.c). To hopefully prevent needing a regression fix on top of my security fix I rounded folks who work with the container implementations mostly like to be affected and encouraged them to test the changes. > So far nothing broke on my libvirt-lxc test bed. :-) > Tested with openSUSE 13.2 and libvirt 1.2.9. > Tested-by: Richard Weinberger <richard@nod.at> > Tested on Fedora20 with libvirt 1.2.11, works fine. > Tested-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> > Ok, thanks - yes, unprivileged lxc is working fine with your kernels. > Just to be sure I was testing the right thing I also tested using > my unprivileged nsexec testcases, and they failed on setgroup/setgid > as now expected, and succeeded there without your patches. > Tested-by: Serge Hallyn <serge.hallyn@ubuntu.com> > I tested this with Sandstorm. It breaks as is and it works if I add > the setgroups thing. > Tested-by: Andy Lutomirski <luto@amacapital.net> # breaks things as designed :(" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: userns: Unbreak the unprivileged remount tests userns; Correct the comment in map_write userns: Allow setting gid_maps without privilege when setgroups is disabled userns: Add a knob to disable setgroups on a per user namespace basis userns: Rename id_map_mutex to userns_state_mutex userns: Only allow the creator of the userns unprivileged mappings userns: Check euid no fsuid when establishing an unprivileged uid mapping userns: Don't allow unprivileged creation of gid mappings userns: Don't allow setgroups until a gid mapping has been setablished userns: Document what the invariant required for safe unprivileged mappings. groups: Consolidate the setgroups permission checks mnt: Clear mnt_expire during pivot_root mnt: Carefully set CL_UNPRIVILEGED in clone_mnt mnt: Move the clear of MNT_LOCKED from copy_tree to it's callers. umount: Do not allow unmounting rootfs. umount: Disallow unprivileged mount force mnt: Update unprivileged remount test mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount
229 lines
5.4 KiB
C
229 lines
5.4 KiB
C
/*
|
|
* The "user cache".
|
|
*
|
|
* (C) Copyright 1991-2000 Linus Torvalds
|
|
*
|
|
* We have a per-user structure to keep track of how many
|
|
* processes, files etc the user has claimed, in order to be
|
|
* able to have per-user limits for system resources.
|
|
*/
|
|
|
|
#include <linux/init.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/key.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/export.h>
|
|
#include <linux/user_namespace.h>
|
|
#include <linux/proc_ns.h>
|
|
|
|
/*
|
|
* userns count is 1 for root user, 1 for init_uts_ns,
|
|
* and 1 for... ?
|
|
*/
|
|
struct user_namespace init_user_ns = {
|
|
.uid_map = {
|
|
.nr_extents = 1,
|
|
.extent[0] = {
|
|
.first = 0,
|
|
.lower_first = 0,
|
|
.count = 4294967295U,
|
|
},
|
|
},
|
|
.gid_map = {
|
|
.nr_extents = 1,
|
|
.extent[0] = {
|
|
.first = 0,
|
|
.lower_first = 0,
|
|
.count = 4294967295U,
|
|
},
|
|
},
|
|
.projid_map = {
|
|
.nr_extents = 1,
|
|
.extent[0] = {
|
|
.first = 0,
|
|
.lower_first = 0,
|
|
.count = 4294967295U,
|
|
},
|
|
},
|
|
.count = ATOMIC_INIT(3),
|
|
.owner = GLOBAL_ROOT_UID,
|
|
.group = GLOBAL_ROOT_GID,
|
|
.ns.inum = PROC_USER_INIT_INO,
|
|
#ifdef CONFIG_USER_NS
|
|
.ns.ops = &userns_operations,
|
|
#endif
|
|
.flags = USERNS_INIT_FLAGS,
|
|
#ifdef CONFIG_PERSISTENT_KEYRINGS
|
|
.persistent_keyring_register_sem =
|
|
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
|
|
#endif
|
|
};
|
|
EXPORT_SYMBOL_GPL(init_user_ns);
|
|
|
|
/*
|
|
* UID task count cache, to get fast user lookup in "alloc_uid"
|
|
* when changing user ID's (ie setuid() and friends).
|
|
*/
|
|
|
|
#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
|
|
#define UIDHASH_SZ (1 << UIDHASH_BITS)
|
|
#define UIDHASH_MASK (UIDHASH_SZ - 1)
|
|
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
|
|
#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
|
|
|
|
static struct kmem_cache *uid_cachep;
|
|
struct hlist_head uidhash_table[UIDHASH_SZ];
|
|
|
|
/*
|
|
* The uidhash_lock is mostly taken from process context, but it is
|
|
* occasionally also taken from softirq/tasklet context, when
|
|
* task-structs get RCU-freed. Hence all locking must be softirq-safe.
|
|
* But free_uid() is also called with local interrupts disabled, and running
|
|
* local_bh_enable() with local interrupts disabled is an error - we'll run
|
|
* softirq callbacks, and they can unconditionally enable interrupts, and
|
|
* the caller of free_uid() didn't expect that..
|
|
*/
|
|
static DEFINE_SPINLOCK(uidhash_lock);
|
|
|
|
/* root_user.__count is 1, for init task cred */
|
|
struct user_struct root_user = {
|
|
.__count = ATOMIC_INIT(1),
|
|
.processes = ATOMIC_INIT(1),
|
|
.sigpending = ATOMIC_INIT(0),
|
|
.locked_shm = 0,
|
|
.uid = GLOBAL_ROOT_UID,
|
|
};
|
|
|
|
/*
|
|
* These routines must be called with the uidhash spinlock held!
|
|
*/
|
|
static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
|
|
{
|
|
hlist_add_head(&up->uidhash_node, hashent);
|
|
}
|
|
|
|
static void uid_hash_remove(struct user_struct *up)
|
|
{
|
|
hlist_del_init(&up->uidhash_node);
|
|
}
|
|
|
|
static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
|
|
{
|
|
struct user_struct *user;
|
|
|
|
hlist_for_each_entry(user, hashent, uidhash_node) {
|
|
if (uid_eq(user->uid, uid)) {
|
|
atomic_inc(&user->__count);
|
|
return user;
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* IRQs are disabled and uidhash_lock is held upon function entry.
|
|
* IRQ state (as stored in flags) is restored and uidhash_lock released
|
|
* upon function exit.
|
|
*/
|
|
static void free_user(struct user_struct *up, unsigned long flags)
|
|
__releases(&uidhash_lock)
|
|
{
|
|
uid_hash_remove(up);
|
|
spin_unlock_irqrestore(&uidhash_lock, flags);
|
|
key_put(up->uid_keyring);
|
|
key_put(up->session_keyring);
|
|
kmem_cache_free(uid_cachep, up);
|
|
}
|
|
|
|
/*
|
|
* Locate the user_struct for the passed UID. If found, take a ref on it. The
|
|
* caller must undo that ref with free_uid().
|
|
*
|
|
* If the user_struct could not be found, return NULL.
|
|
*/
|
|
struct user_struct *find_user(kuid_t uid)
|
|
{
|
|
struct user_struct *ret;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&uidhash_lock, flags);
|
|
ret = uid_hash_find(uid, uidhashentry(uid));
|
|
spin_unlock_irqrestore(&uidhash_lock, flags);
|
|
return ret;
|
|
}
|
|
|
|
void free_uid(struct user_struct *up)
|
|
{
|
|
unsigned long flags;
|
|
|
|
if (!up)
|
|
return;
|
|
|
|
local_irq_save(flags);
|
|
if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
|
|
free_user(up, flags);
|
|
else
|
|
local_irq_restore(flags);
|
|
}
|
|
|
|
struct user_struct *alloc_uid(kuid_t uid)
|
|
{
|
|
struct hlist_head *hashent = uidhashentry(uid);
|
|
struct user_struct *up, *new;
|
|
|
|
spin_lock_irq(&uidhash_lock);
|
|
up = uid_hash_find(uid, hashent);
|
|
spin_unlock_irq(&uidhash_lock);
|
|
|
|
if (!up) {
|
|
new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
|
|
if (!new)
|
|
goto out_unlock;
|
|
|
|
new->uid = uid;
|
|
atomic_set(&new->__count, 1);
|
|
|
|
/*
|
|
* Before adding this, check whether we raced
|
|
* on adding the same user already..
|
|
*/
|
|
spin_lock_irq(&uidhash_lock);
|
|
up = uid_hash_find(uid, hashent);
|
|
if (up) {
|
|
key_put(new->uid_keyring);
|
|
key_put(new->session_keyring);
|
|
kmem_cache_free(uid_cachep, new);
|
|
} else {
|
|
uid_hash_insert(new, hashent);
|
|
up = new;
|
|
}
|
|
spin_unlock_irq(&uidhash_lock);
|
|
}
|
|
|
|
return up;
|
|
|
|
out_unlock:
|
|
return NULL;
|
|
}
|
|
|
|
static int __init uid_cache_init(void)
|
|
{
|
|
int n;
|
|
|
|
uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
|
|
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
|
|
|
|
for(n = 0; n < UIDHASH_SZ; ++n)
|
|
INIT_HLIST_HEAD(uidhash_table + n);
|
|
|
|
/* Insert the root user immediately (init already runs as root) */
|
|
spin_lock_irq(&uidhash_lock);
|
|
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
|
|
spin_unlock_irq(&uidhash_lock);
|
|
|
|
return 0;
|
|
}
|
|
subsys_initcall(uid_cache_init);
|