Linux v3.17.8

This commit is contained in:
Justin M. Forbes 2015-01-08 17:04:10 -06:00
parent 0ace95ceec
commit aa5585d52d
23 changed files with 6 additions and 1917 deletions

View File

@ -1,42 +0,0 @@
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 27 Nov 2014 12:26:46 +0000
Subject: [PATCH] dm cache: dirty flag was mistakenly being cleared when
promoting via overwrite
If the incoming bio is a WRITE and completely covers a block then we
don't bother to do any copying for a promotion operation. Once this is
done the cache block and origin block will be different, so we need to
set it to 'dirty'.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
drivers/md/dm-cache-target.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6f7086355691..387b93d81138 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -951,10 +951,14 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
}
} else {
- clear_dirty(cache, mg->new_oblock, mg->cblock);
- if (mg->requeue_holder)
+ if (mg->requeue_holder) {
+ clear_dirty(cache, mg->new_oblock, mg->cblock);
cell_defer(cache, mg->new_ocell, true);
- else {
+ } else {
+ /*
+ * The block was promoted via an overwrite, so it's dirty.
+ */
+ set_dirty(cache, mg->new_oblock, mg->cblock);
bio_endio(mg->new_ocell->holder, 0);
cell_defer(cache, mg->new_ocell, false);
}
--
2.1.0

View File

@ -1,40 +0,0 @@
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 28 Nov 2014 09:48:25 +0000
Subject: [PATCH] dm cache: fix spurious cell_defer when dealing with partial
block at end of device
We never bother caching a partial block that is at the back end of the
origin device. No cell ever gets locked, but the calling code was
assuming it was and trying to release it.
Now the code only releases if the cell has been set to a non NULL
value.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
drivers/md/dm-cache-target.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 387b93d81138..da496cfb458d 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2554,11 +2554,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
static int cache_map(struct dm_target *ti, struct bio *bio)
{
int r;
- struct dm_bio_prison_cell *cell;
+ struct dm_bio_prison_cell *cell = NULL;
struct cache *cache = ti->private;
r = __cache_map(cache, bio, &cell);
- if (r == DM_MAPIO_REMAPPED) {
+ if (r == DM_MAPIO_REMAPPED && cell) {
inc_ds(cache, bio, cell);
cell_defer(cache, cell, false);
}
--
2.1.0

View File

@ -1,32 +0,0 @@
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 27 Nov 2014 12:21:08 +0000
Subject: [PATCH] dm cache: only use overwrite optimisation for promotion when
in writeback mode
Overwrite causes the cache block and origin blocks to diverge, which
is only allowed in writeback mode.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
drivers/md/dm-cache-target.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7130505c2425..6f7086355691 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1070,7 +1070,8 @@ static void issue_copy(struct dm_cache_migration *mg)
avoid = is_discarded_oblock(cache, mg->new_oblock);
- if (!avoid && bio_writes_complete_block(cache, bio)) {
+ if (writeback_mode(&cache->features) &&
+ !avoid && bio_writes_complete_block(cache, bio)) {
issue_overwrite(mg, bio);
return;
}
--
2.1.0

View File

@ -1,90 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 Dec 2014 17:19:27 -0600
Subject: [PATCH] groups: Consolidate the setgroups permission checks
Today there are 3 instances of setgroups and due to an oversight their
permission checking has diverged. Add a common function so that
they may all share the same permission checking code.
This corrects the current oversight in the current permission checks
and adds a helper to avoid this in the future.
A user namespace security fix will update this new helper, shortly.
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
arch/s390/kernel/compat_linux.c | 2 +-
include/linux/cred.h | 1 +
kernel/groups.c | 9 ++++++++-
kernel/uid16.c | 2 +-
4 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index ca38139423ae..437e61159279 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -249,7 +249,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index b2d0820837c4..2fb2ca2127ed 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -68,6 +68,7 @@ extern void groups_free(struct group_info *);
extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern int groups_search(const struct group_info *, kgid_t);
+extern bool may_setgroups(void);
/* access the groups "array" with this macro */
#define GROUP_AT(gi, i) \
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..02d8a251c476 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -213,6 +213,13 @@ out:
return i;
}
+bool may_setgroups(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ return ns_capable(user_ns, CAP_SETGID);
+}
+
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
@@ -223,7 +230,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
--
2.1.0

View File

@ -1,54 +0,0 @@
From: Jan Kara <jack@suse.cz>
Date: Mon, 15 Dec 2014 14:22:46 +0100
Subject: [PATCH] isofs: Fix infinite looping over CE entries
Rock Ridge extensions define so called Continuation Entries (CE) which
define where is further space with Rock Ridge data. Corrupted isofs
image can contain arbitrarily long chain of these, including a one
containing loop and thus causing kernel to end in an infinite loop when
traversing these entries.
Limit the traversal to 32 entries which should be more than enough space
to store all the Rock Ridge data.
Reported-by: P J P <ppandit@redhat.com>
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/isofs/rock.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index f488bbae541a..bb63254ed848 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -30,6 +30,7 @@ struct rock_state {
int cont_size;
int cont_extent;
int cont_offset;
+ int cont_loops;
struct inode *inode;
};
@@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode)
rs->inode = inode;
}
+/* Maximum number of Rock Ridge continuation entries */
+#define RR_MAX_CE_ENTRIES 32
+
/*
* Returns 0 if the caller should continue scanning, 1 if the scan must end
* and -ve on error.
@@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs)
goto out;
}
ret = -EIO;
+ if (++rs->cont_loops >= RR_MAX_CE_ENTRIES)
+ goto out;
bh = sb_bread(rs->inode->i_sb, rs->cont_extent);
if (bh) {
memcpy(rs->buffer, bh->b_data + rs->cont_offset,
--
2.1.0

View File

@ -42,7 +42,7 @@ Summary: The Linux kernel
# For non-released -rc kernels, this will be appended after the rcX and
# gitX tags, so a 3 here would become part of release "0.rcX.gitX.3"
#
%global baserelease 301
%global baserelease 300
%global fedora_build %{baserelease}
# base_sublevel is the kernel version we're starting with and patching
@ -54,7 +54,7 @@ Summary: The Linux kernel
%if 0%{?released_kernel}
# Do we have a -stable update to apply?
%define stable_update 7
%define stable_update 8
# Set rpm version accordingly
%if 0%{?stable_update}
%define stablerev %{stable_update}
@ -624,9 +624,6 @@ Patch26058: asus-nb-wmi-Add-wapf4-quirk-for-the-X550VB.patch
#rhbz 1135338
Patch26090: HID-add-support-for-MS-Surface-Pro-3-Type-Cover.patch
#CVE-2014-8134 rhbz 1172765 1172769
Patch26091: x86-kvm-Clear-paravirt_enabled-on-KVM-guests-for-esp.patch
#rhbz 1164945
Patch26092: xhci-Add-broken-streams-quirk-for-Fresco-Logic-FL100.patch
Patch26093: uas-Add-US_FL_NO_ATA_1X-for-Seagate-devices-with-usb.patch
@ -635,9 +632,6 @@ Patch26094: uas-Add-US_FL_NO_REPORT_OPCODES-for-JMicron-JMS566-w.patch
#rhbz 1172543
Patch26096: cfg80211-don-t-WARN-about-two-consecutive-Country-IE.patch
#CVE-2014-8133 rhbz 1172797 1174374
Patch26100: x86-tls-Validate-TLS-entries-to-protect-espfix.patch
#rhbz 1173806
Patch26101: powerpc-powernv-force-all-CPUs-to-be-bootable.patch
@ -645,43 +639,15 @@ Patch26101: powerpc-powernv-force-all-CPUs-to-be-bootable.patch
Patch26098: move-d_rcu-from-overlapping-d_child-to-overlapping-d.patch
Patch26099: deal-with-deadlock-in-d_walk.patch
#CVE-2014-XXXX rhbz 1175235 1175250
Patch26102: isofs-Fix-infinite-looping-over-CE-entries.patch
#rhbz 1175261
Patch26103: blk-mq-Fix-uninitialized-kobject-at-CPU-hotplugging.patch
#rhbz 1168434
Patch26104: dm-cache-only-use-overwrite-optimisation-for-promoti.patch
Patch26105: dm-cache-dirty-flag-was-mistakenly-being-cleared-whe.patch
Patch26106: dm-cache-fix-spurious-cell_defer-when-dealing-with-p.patch
#mount fixes for stable
Patch26108: mnt-Implicitly-add-MNT_NODEV-on-remount-when-it-was-.patch
Patch26109: mnt-Update-unprivileged-remount-test.patch
Patch26110: umount-Disallow-unprivileged-mount-force.patch
#CVE-2014-8989 rhbz 1170684 1170688
Patch26111: groups-Consolidate-the-setgroups-permission-checks.patch
Patch26112: userns-Document-what-the-invariant-required-for-safe.patch
Patch26113: userns-Don-t-allow-setgroups-until-a-gid-mapping-has.patch
Patch26114: userns-Don-t-allow-unprivileged-creation-of-gid-mapp.patch
Patch26115: userns-Check-euid-no-fsuid-when-establishing-an-unpr.patch
Patch26116: userns-Only-allow-the-creator-of-the-userns-unprivil.patch
Patch26117: userns-Rename-id_map_mutex-to-userns_state_mutex.patch
Patch26118: userns-Add-a-knob-to-disable-setgroups-on-a-per-user.patch
Patch26119: userns-Allow-setting-gid_maps-without-privilege-when.patch
Patch26120: userns-Unbreak-the-unprivileged-remount-tests.patch
#rhbz 1163927
Patch26121: Set-UID-in-sess_auth_rawntlmssp_authenticate-too.patch
#CVE-2014-9428 rhbz 1178826,1178833
Patch26122: batman-adv-Calculate-extra-tail-size-based-on-queued.patch
#CVE-2014-9419 rhbz 1177260,1177263
Patch26123: x86_64-switch_to-Load-TLS-descriptors-before-switchi.patch
#CVE-2014-9529 rhbz 1179813 1179853
Patch26124: KEYS-close-race-between-key-lookup-and-freeing.patch
@ -1418,9 +1384,6 @@ ApplyPatch asus-nb-wmi-Add-wapf4-quirk-for-the-X550VB.patch
#rhbz 1135338
ApplyPatch HID-add-support-for-MS-Surface-Pro-3-Type-Cover.patch
#CVE-2014-8134 rhbz 1172765 1172769
ApplyPatch x86-kvm-Clear-paravirt_enabled-on-KVM-guests-for-esp.patch
#rhbz 1164945
ApplyPatch xhci-Add-broken-streams-quirk-for-Fresco-Logic-FL100.patch
ApplyPatch uas-Add-US_FL_NO_ATA_1X-for-Seagate-devices-with-usb.patch
@ -1429,9 +1392,6 @@ ApplyPatch uas-Add-US_FL_NO_REPORT_OPCODES-for-JMicron-JMS566-w.patch
#rhbz 1172543
ApplyPatch cfg80211-don-t-WARN-about-two-consecutive-Country-IE.patch
#CVE-2014-8133 rhbz 1172797 1174374
ApplyPatch x86-tls-Validate-TLS-entries-to-protect-espfix.patch
#rhbz 1173806
ApplyPatch powerpc-powernv-force-all-CPUs-to-be-bootable.patch
@ -1439,43 +1399,15 @@ ApplyPatch powerpc-powernv-force-all-CPUs-to-be-bootable.patch
ApplyPatch move-d_rcu-from-overlapping-d_child-to-overlapping-d.patch
ApplyPatch deal-with-deadlock-in-d_walk.patch
#CVE-2014-XXXX rhbz 1175235 1175250
ApplyPatch isofs-Fix-infinite-looping-over-CE-entries.patch
#rhbz 1175261
ApplyPatch blk-mq-Fix-uninitialized-kobject-at-CPU-hotplugging.patch
#rhbz 1168434
ApplyPatch dm-cache-only-use-overwrite-optimisation-for-promoti.patch
ApplyPatch dm-cache-dirty-flag-was-mistakenly-being-cleared-whe.patch
ApplyPatch dm-cache-fix-spurious-cell_defer-when-dealing-with-p.patch
#mount fixes for stable
ApplyPatch mnt-Implicitly-add-MNT_NODEV-on-remount-when-it-was-.patch
ApplyPatch mnt-Update-unprivileged-remount-test.patch
ApplyPatch umount-Disallow-unprivileged-mount-force.patch
#CVE-2014-8989 rhbz 1170684 1170688
ApplyPatch groups-Consolidate-the-setgroups-permission-checks.patch
ApplyPatch userns-Document-what-the-invariant-required-for-safe.patch
ApplyPatch userns-Don-t-allow-setgroups-until-a-gid-mapping-has.patch
ApplyPatch userns-Don-t-allow-unprivileged-creation-of-gid-mapp.patch
ApplyPatch userns-Check-euid-no-fsuid-when-establishing-an-unpr.patch
ApplyPatch userns-Only-allow-the-creator-of-the-userns-unprivil.patch
ApplyPatch userns-Rename-id_map_mutex-to-userns_state_mutex.patch
ApplyPatch userns-Add-a-knob-to-disable-setgroups-on-a-per-user.patch
ApplyPatch userns-Allow-setting-gid_maps-without-privilege-when.patch
ApplyPatch userns-Unbreak-the-unprivileged-remount-tests.patch
#rhbz 1163927
ApplyPatch Set-UID-in-sess_auth_rawntlmssp_authenticate-too.patch
#CVE-2014-9428 rhbz 1178826,1178833
ApplyPatch batman-adv-Calculate-extra-tail-size-based-on-queued.patch
#CVE-2014-9419 rhbz 1177260,1177263
ApplyPatch x86_64-switch_to-Load-TLS-descriptors-before-switchi.patch
#CVE-2014-9529 rhbz 1179813 1179853
ApplyPatch KEYS-close-race-between-key-lookup-and-freeing.patch
@ -2353,6 +2285,9 @@ fi
# ||----w |
# || ||
%changelog
* Thu Jan 08 2015 Justin M. Forbes <jforbes@fedoraproject.org> - 3.17.8-300
- Linux v3.17.8
* Wed Jan 07 2015 Josh Boyer <jwboyer@fedoraproject.org>
- CVE-2014-9529 memory corruption or panic during key gc (rhbz 1179813 1179853)
- Enable POWERCAP and INTEL_RAPL

View File

@ -1,41 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 13 Aug 2014 01:33:38 -0700
Subject: [PATCH] mnt: Implicitly add MNT_NODEV on remount when it was
implicitly added by mount
Now that remount is properly enforcing the rule that you can't remove
nodev at least sandstorm.io is breaking when performing a remount.
It turns out that there is an easy intuitive solution implicitly
add nodev on remount when nodev was implicitly added on mount.
Tested-by: Cedric Bosdonnat <cbosdonnat@suse.com>
Tested-by: Richard Weinberger <richard@nod.at>
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
fs/namespace.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 550dbff08677..72b41e49772d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1955,7 +1955,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- return -EPERM;
+ /* Was the nodev implicitly added in mount? */
+ if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
+ !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+ mnt_flags |= MNT_NODEV;
+ } else {
+ return -EPERM;
+ }
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
--
2.1.0

View File

@ -1,280 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Aug 2014 16:39:03 -0500
Subject: [PATCH] mnt: Update unprivileged remount test
- MNT_NODEV should be irrelevant except when reading back mount flags,
no longer specify MNT_NODEV on remount.
- Test MNT_NODEV on devpts where it is meaningful even for unprivileged mounts.
- Add a test to verify that remount of a prexisting mount with the same flags
is allowed and does not change those flags.
- Cleanup up the definitions of MS_REC, MS_RELATIME, MS_STRICTATIME that are used
when the code is built in an environment without them.
- Correct the test error messages when tests fail. There were not 5 tests
that tested MS_RELATIME.
Cc: stable@vger.kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
.../selftests/mount/unprivileged-remount-test.c | 172 +++++++++++++++++----
1 file changed, 142 insertions(+), 30 deletions(-)
diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c
index 1b3ff2fda4d0..9669d375625a 100644
--- a/tools/testing/selftests/mount/unprivileged-remount-test.c
+++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
@@ -6,6 +6,8 @@
#include <sys/types.h>
#include <sys/mount.h>
#include <sys/wait.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
@@ -32,11 +34,14 @@
# define CLONE_NEWPID 0x20000000
#endif
+#ifndef MS_REC
+# define MS_REC 16384
+#endif
#ifndef MS_RELATIME
-#define MS_RELATIME (1 << 21)
+# define MS_RELATIME (1 << 21)
#endif
#ifndef MS_STRICTATIME
-#define MS_STRICTATIME (1 << 24)
+# define MS_STRICTATIME (1 << 24)
#endif
static void die(char *fmt, ...)
@@ -87,6 +92,45 @@ static void write_file(char *filename, char *fmt, ...)
}
}
+static int read_mnt_flags(const char *path)
+{
+ int ret;
+ struct statvfs stat;
+ int mnt_flags;
+
+ ret = statvfs(path, &stat);
+ if (ret != 0) {
+ die("statvfs of %s failed: %s\n",
+ path, strerror(errno));
+ }
+ if (stat.f_flag & ~(ST_RDONLY | ST_NOSUID | ST_NODEV | \
+ ST_NOEXEC | ST_NOATIME | ST_NODIRATIME | ST_RELATIME | \
+ ST_SYNCHRONOUS | ST_MANDLOCK)) {
+ die("Unrecognized mount flags\n");
+ }
+ mnt_flags = 0;
+ if (stat.f_flag & ST_RDONLY)
+ mnt_flags |= MS_RDONLY;
+ if (stat.f_flag & ST_NOSUID)
+ mnt_flags |= MS_NOSUID;
+ if (stat.f_flag & ST_NODEV)
+ mnt_flags |= MS_NODEV;
+ if (stat.f_flag & ST_NOEXEC)
+ mnt_flags |= MS_NOEXEC;
+ if (stat.f_flag & ST_NOATIME)
+ mnt_flags |= MS_NOATIME;
+ if (stat.f_flag & ST_NODIRATIME)
+ mnt_flags |= MS_NODIRATIME;
+ if (stat.f_flag & ST_RELATIME)
+ mnt_flags |= MS_RELATIME;
+ if (stat.f_flag & ST_SYNCHRONOUS)
+ mnt_flags |= MS_SYNCHRONOUS;
+ if (stat.f_flag & ST_MANDLOCK)
+ mnt_flags |= ST_MANDLOCK;
+
+ return mnt_flags;
+}
+
static void create_and_enter_userns(void)
{
uid_t uid;
@@ -118,7 +162,8 @@ static void create_and_enter_userns(void)
}
static
-bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
+bool test_unpriv_remount(const char *fstype, const char *mount_options,
+ int mount_flags, int remount_flags, int invalid_flags)
{
pid_t child;
@@ -151,9 +196,11 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
strerror(errno));
}
- if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) {
- die("mount of /tmp failed: %s\n",
- strerror(errno));
+ if (mount("testing", "/tmp", fstype, mount_flags, mount_options) != 0) {
+ die("mount of %s with options '%s' on /tmp failed: %s\n",
+ fstype,
+ mount_options? mount_options : "",
+ strerror(errno));
}
create_and_enter_userns();
@@ -181,62 +228,127 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
static bool test_unpriv_remount_simple(int mount_flags)
{
- return test_unpriv_remount(mount_flags, mount_flags, 0);
+ return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, 0);
}
static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags)
{
- return test_unpriv_remount(mount_flags, mount_flags, invalid_flags);
+ return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags,
+ invalid_flags);
+}
+
+static bool test_priv_mount_unpriv_remount(void)
+{
+ pid_t child;
+ int ret;
+ const char *orig_path = "/dev";
+ const char *dest_path = "/tmp";
+ int orig_mnt_flags, remount_mnt_flags;
+
+ child = fork();
+ if (child == -1) {
+ die("fork failed: %s\n",
+ strerror(errno));
+ }
+ if (child != 0) { /* parent */
+ pid_t pid;
+ int status;
+ pid = waitpid(child, &status, 0);
+ if (pid == -1) {
+ die("waitpid failed: %s\n",
+ strerror(errno));
+ }
+ if (pid != child) {
+ die("waited for %d got %d\n",
+ child, pid);
+ }
+ if (!WIFEXITED(status)) {
+ die("child did not terminate cleanly\n");
+ }
+ return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+ }
+
+ orig_mnt_flags = read_mnt_flags(orig_path);
+
+ create_and_enter_userns();
+ ret = unshare(CLONE_NEWNS);
+ if (ret != 0) {
+ die("unshare(CLONE_NEWNS) failed: %s\n",
+ strerror(errno));
+ }
+
+ ret = mount(orig_path, dest_path, "bind", MS_BIND | MS_REC, NULL);
+ if (ret != 0) {
+ die("recursive bind mount of %s onto %s failed: %s\n",
+ orig_path, dest_path, strerror(errno));
+ }
+
+ ret = mount(dest_path, dest_path, "none",
+ MS_REMOUNT | MS_BIND | orig_mnt_flags , NULL);
+ if (ret != 0) {
+ /* system("cat /proc/self/mounts"); */
+ die("remount of /tmp failed: %s\n",
+ strerror(errno));
+ }
+
+ remount_mnt_flags = read_mnt_flags(dest_path);
+ if (orig_mnt_flags != remount_mnt_flags) {
+ die("Mount flags unexpectedly changed during remount of %s originally mounted on %s\n",
+ dest_path, orig_path);
+ }
+ exit(EXIT_SUCCESS);
}
int main(int argc, char **argv)
{
- if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) {
+ if (!test_unpriv_remount_simple(MS_RDONLY)) {
die("MS_RDONLY malfunctions\n");
}
- if (!test_unpriv_remount_simple(MS_NODEV)) {
+ if (!test_unpriv_remount("devpts", "newinstance", MS_NODEV, MS_NODEV, 0)) {
die("MS_NODEV malfunctions\n");
}
- if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) {
+ if (!test_unpriv_remount_simple(MS_NOSUID)) {
die("MS_NOSUID malfunctions\n");
}
- if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) {
+ if (!test_unpriv_remount_simple(MS_NOEXEC)) {
die("MS_NOEXEC malfunctions\n");
}
- if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV,
- MS_NOATIME|MS_NODEV))
+ if (!test_unpriv_remount_atime(MS_RELATIME,
+ MS_NOATIME))
{
die("MS_RELATIME malfunctions\n");
}
- if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV,
- MS_NOATIME|MS_NODEV))
+ if (!test_unpriv_remount_atime(MS_STRICTATIME,
+ MS_NOATIME))
{
die("MS_STRICTATIME malfunctions\n");
}
- if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV,
- MS_STRICTATIME|MS_NODEV))
+ if (!test_unpriv_remount_atime(MS_NOATIME,
+ MS_STRICTATIME))
{
- die("MS_RELATIME malfunctions\n");
+ die("MS_NOATIME malfunctions\n");
}
- if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV,
- MS_NOATIME|MS_NODEV))
+ if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME,
+ MS_NOATIME))
{
- die("MS_RELATIME malfunctions\n");
+ die("MS_RELATIME|MS_NODIRATIME malfunctions\n");
}
- if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV,
- MS_NOATIME|MS_NODEV))
+ if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME,
+ MS_NOATIME))
{
- die("MS_RELATIME malfunctions\n");
+ die("MS_STRICTATIME|MS_NODIRATIME malfunctions\n");
}
- if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV,
- MS_STRICTATIME|MS_NODEV))
+ if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME,
+ MS_STRICTATIME))
{
- die("MS_RELATIME malfunctions\n");
+ die("MS_NOATIME|MS_DIRATIME malfunctions\n");
}
- if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV,
- MS_NOATIME|MS_NODEV))
+ if (!test_unpriv_remount("ramfs", NULL, MS_STRICTATIME, 0, MS_NOATIME))
{
die("Default atime malfunctions\n");
}
+ if (!test_priv_mount_unpriv_remount()) {
+ die("Mount flags unexpectedly changed after remount\n");
+ }
return EXIT_SUCCESS;
}
--
2.1.0

BIN
perf-man.tar.gz Normal file

Binary file not shown.

View File

@ -1,3 +1,3 @@
fb30d0f29214d75cddd2faa94f73d5cf linux-3.17.tar.xz
159e969cbc27201d8e2fa0f609dc722f perf-man-3.17.tar.gz
96d5959bdc223fa6aa0ed132c93cf814 patch-3.17.7.xz
4ea1c0e18b18406bcd248bf06b95aec3 patch-3.17.8.xz

View File

@ -1,33 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 4 Oct 2014 14:44:03 -0700
Subject: [PATCH] umount: Disallow unprivileged mount force
Forced unmount affects not just the mount namespace but the underlying
superblock as well. Restrict forced unmount to the global root user
for now. Otherwise it becomes possible a user in a less privileged
mount namespace to force the shutdown of a superblock of a filesystem
in a more privileged mount namespace, allowing a DOS attack on root.
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
fs/namespace.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/namespace.c b/fs/namespace.c
index 72b41e49772d..90707be662f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1430,6 +1430,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
goto dput_and_out;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto dput_and_out;
+ retval = -EPERM;
+ if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ goto dput_and_out;
retval = do_umount(mnt, flags);
dput_and_out:
--
2.1.0

View File

@ -1,280 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 2 Dec 2014 12:27:26 -0600
Subject: [PATCH] userns: Add a knob to disable setgroups on a per user
namespace basis
- Expose the knob to user space through a proc file /proc/<pid>/setgroups
A value of "deny" means the setgroups system call is disabled in the
current processes user namespace and can not be enabled in the
future in this user namespace.
A value of "allow" means the segtoups system call is enabled.
- Descendant user namespaces inherit the value of setgroups from
their parents.
- A proc file is used (instead of a sysctl) as sysctls currently do
not allow checking the permissions at open time.
- Writing to the proc file is restricted to before the gid_map
for the user namespace is set.
This ensures that disabling setgroups at a user namespace
level will never remove the ability to call setgroups
from a process that already has that ability.
A process may opt in to the setgroups disable for itself by
creating, entering and configuring a user namespace or by calling
setns on an existing user namespace with setgroups disabled.
Processes without privileges already can not call setgroups so this
is a noop. Prodcess with privilege become processes without
privilege when entering a user namespace and as with any other path
to dropping privilege they would not have the ability to call
setgroups. So this remains within the bounds of what is possible
without a knob to disable setgroups permanently in a user namespace.
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
fs/proc/base.c | 53 ++++++++++++++++++++++++++
include/linux/user_namespace.h | 7 ++++
kernel/user.c | 1 +
kernel/user_namespace.c | 85 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 146 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index baf852b648ad..3ec60dee75da 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2493,6 +2493,57 @@ static const struct file_operations proc_projid_map_operations = {
.llseek = seq_lseek,
.release = proc_id_map_release,
};
+
+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ int ret;
+
+ ret = -ESRCH;
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ if (file->f_mode & FMODE_WRITE) {
+ ret = -EACCES;
+ if (!ns_capable(ns, CAP_SYS_ADMIN))
+ goto err_put_ns;
+ }
+
+ ret = single_open(file, &proc_setgroups_show, ns);
+ if (ret)
+ goto err_put_ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_setgroups_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ int ret = single_release(inode, file);
+ put_user_ns(ns);
+ return ret;
+}
+
+static const struct file_operations proc_setgroups_operations = {
+ .open = proc_setgroups_open,
+ .write = proc_setgroups_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_setgroups_release,
+};
#endif /* CONFIG_USER_NS */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -2601,6 +2652,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
@@ -2944,6 +2996,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
};
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8d493083486a..9f3579ff543d 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -17,6 +17,10 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
} extent[UID_GID_MAP_MAX_EXTENTS];
};
+#define USERNS_SETGROUPS_ALLOWED 1UL
+
+#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
@@ -27,6 +31,7 @@ struct user_namespace {
kuid_t owner;
kgid_t group;
unsigned int proc_inum;
+ unsigned long flags;
/* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -63,6 +68,8 @@ extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
+extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
+extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
#else
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..2d09940c9632 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,7 @@ struct user_namespace init_user_ns = {
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
+ .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
.persistent_keyring_register_sem =
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 44a555ac6104..6e80f4c1322b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -100,6 +100,11 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
+ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+ mutex_lock(&userns_state_mutex);
+ ns->flags = parent_ns->flags;
+ mutex_unlock(&userns_state_mutex);
+
set_cred_user_ns(new, ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -839,6 +844,84 @@ static bool new_idmap_permitted(const struct file *file,
return false;
}
+int proc_setgroups_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+
+ seq_printf(seq, "%s\n",
+ (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
+ "allow" : "deny");
+ return 0;
+}
+
+ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool setgroups_allowed;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (strncmp(pos, "allow", 5) == 0) {
+ pos += 5;
+ setgroups_allowed = true;
+ }
+ else if (strncmp(pos, "deny", 4) == 0) {
+ pos += 4;
+ setgroups_allowed = false;
+ }
+ else
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ if (setgroups_allowed) {
+ /* Enabling setgroups after setgroups has been disabled
+ * is not allowed.
+ */
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
+ goto out_unlock;
+ } else {
+ /* Permanently disabling setgroups after setgroups has
+ * been enabled by writing the gid_map is not allowed.
+ */
+ if (ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
+
bool userns_may_setgroups(const struct user_namespace *ns)
{
bool allowed;
@@ -848,6 +931,8 @@ bool userns_may_setgroups(const struct user_namespace *ns)
* the user namespace has been established.
*/
allowed = ns->gid_map.nr_extents != 0;
+ /* Is setgroups allowed? */
+ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
mutex_unlock(&userns_state_mutex);
return allowed;
--
2.1.0

View File

@ -1,40 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 Dec 2014 19:36:04 -0600
Subject: [PATCH] userns: Allow setting gid_maps without privilege when
setgroups is disabled
Now that setgroups can be disabled and not reenabled, setting gid_map
without privielge can now be enabled when setgroups is disabled.
This restores most of the functionality that was lost when unprivileged
setting of gid_map was removed. Applications that use this functionality
will need to check to see if they use setgroups or init_groups, and if they
don't they can be fixed by simply disabling setgroups before writing to
gid_map.
Cc: stable@vger.kernel.org
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
kernel/user_namespace.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6e80f4c1322b..a2e37c5d2f63 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -826,6 +826,11 @@ static bool new_idmap_permitted(const struct file *file,
kuid_t uid = make_kuid(ns->parent, id);
if (uid_eq(uid, cred->euid))
return true;
+ } else if (cap_setid == CAP_SETGID) {
+ kgid_t gid = make_kgid(ns->parent, id);
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
+ gid_eq(gid, cred->egid))
+ return true;
}
}
--
2.1.0

View File

@ -1,39 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 Dec 2014 18:26:30 -0600
Subject: [PATCH] userns: Check euid no fsuid when establishing an unprivileged
uid mapping
setresuid allows the euid to be set to any of uid, euid, suid, and
fsuid. Therefor it is safe to allow an unprivileged user to map
their euid and use CAP_SETUID privileged with exactly that uid,
as no new credentials can be obtained.
I can not find a combination of existing system calls that allows setting
uid, euid, suid, and fsuid from the fsuid making the previous use
of fsuid for allowing unprivileged mappings a bug.
This is part of a fix for CVE-2014-8989.
Cc: stable@vger.kernel.org
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
kernel/user_namespace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1ce6d67c07b7..9451b12a9b6c 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -819,7 +819,7 @@ static bool new_idmap_permitted(const struct file *file,
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, file->f_cred->fsuid))
+ if (uid_eq(uid, file->f_cred->euid))
return true;
}
}
--
2.1.0

View File

@ -1,48 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 Dec 2014 17:51:47 -0600
Subject: [PATCH] userns: Document what the invariant required for safe
unprivileged mappings.
The rule is simple. Don't allow anything that wouldn't be allowed
without unprivileged mappings.
It was previously overlooked that establishing gid mappings would
allow dropping groups and potentially gaining permission to files and
directories that had lesser permissions for a specific group than for
all other users.
This is the rule needed to fix CVE-2014-8989 and prevent any other
security issues with new_idmap_permitted.
The reason for this rule is that the unix permission model is old and
there are programs out there somewhere that take advantage of every
little corner of it. So allowing a uid or gid mapping to be
established without privielge that would allow anything that would not
be allowed without that mapping will result in expectations from some
code somewhere being violated. Violated expectations about the
behavior of the OS is a long way to say a security issue.
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
kernel/user_namespace.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..b99c862a2e3f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -812,7 +812,9 @@ static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
- /* Allow mapping to your own filesystem ids */
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
--
2.1.0

View File

@ -1,98 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 Dec 2014 18:01:11 -0600
Subject: [PATCH] userns: Don't allow setgroups until a gid mapping has been
setablished
setgroups is unique in not needing a valid mapping before it can be called,
in the case of setgroups(0, NULL) which drops all supplemental groups.
The design of the user namespace assumes that CAP_SETGID can not actually
be used until a gid mapping is established. Therefore add a helper function
to see if the user namespace gid mapping has been established and call
that function in the setgroups permission check.
This is part of the fix for CVE-2014-8989, being able to drop groups
without privilege using user namespaces.
Cc: stable@vger.kernel.org
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
include/linux/user_namespace.h | 5 +++++
kernel/groups.c | 4 +++-
kernel/user_namespace.c | 14 ++++++++++++++
3 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index e95372654f09..8d493083486a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,7 @@ extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
+extern bool userns_may_setgroups(const struct user_namespace *ns);
#else
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -87,6 +88,10 @@ static inline void put_user_ns(struct user_namespace *ns)
{
}
+static inline bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ return true;
+}
#endif
#endif /* _LINUX_USER_H */
diff --git a/kernel/groups.c b/kernel/groups.c
index 02d8a251c476..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -217,7 +218,8 @@ bool may_setgroups(void)
{
struct user_namespace *user_ns = current_user_ns();
- return ns_capable(user_ns, CAP_SETGID);
+ return ns_capable(user_ns, CAP_SETGID) &&
+ userns_may_setgroups(user_ns);
}
/*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b99c862a2e3f..27c8dab48c07 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -843,6 +843,20 @@ static bool new_idmap_permitted(const struct file *file,
return false;
}
+bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ bool allowed;
+
+ mutex_lock(&id_map_mutex);
+ /* It is not safe to use setgroups until a gid mapping in
+ * the user namespace has been established.
+ */
+ allowed = ns->gid_map.nr_extents != 0;
+ mutex_unlock(&id_map_mutex);
+
+ return allowed;
+}
+
static void *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
--
2.1.0

View File

@ -1,46 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 Dec 2014 18:14:19 -0600
Subject: [PATCH] userns: Don't allow unprivileged creation of gid mappings
As any gid mapping will allow and must allow for backwards
compatibility dropping groups don't allow any gid mappings to be
established without CAP_SETGID in the parent user namespace.
For a small class of applications this change breaks userspace
and removes useful functionality. This small class of applications
includes tools/testing/selftests/mount/unprivilged-remount-test.c
Most of the removed functionality will be added back with the addition
of a one way knob to disable setgroups. Once setgroups is disabled
setting the gid_map becomes as safe as setting the uid_map.
For more common applications that set the uid_map and the gid_map
with privilege this change will have no affect.
This is part of a fix for CVE-2014-8989.
Cc: stable@vger.kernel.org
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
kernel/user_namespace.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 27c8dab48c07..1ce6d67c07b7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -821,10 +821,6 @@ static bool new_idmap_permitted(const struct file *file,
kuid_t uid = make_kuid(ns->parent, id);
if (uid_eq(uid, file->f_cred->fsuid))
return true;
- } else if (cap_setid == CAP_SETGID) {
- kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, file->f_cred->fsgid))
- return true;
}
}
--
2.1.0

View File

@ -1,54 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 26 Nov 2014 23:22:14 -0600
Subject: [PATCH] userns: Only allow the creator of the userns unprivileged
mappings
If you did not create the user namespace and are allowed
to write to uid_map or gid_map you should already have the necessary
privilege in the parent user namespace to establish any mapping
you want so this will not affect userspace in practice.
Limiting unprivileged uid mapping establishment to the creator of the
user namespace makes it easier to verify all credentials obtained with
the uid mapping can be obtained without the uid mapping without
privilege.
Limiting unprivileged gid mapping establishment (which is temporarily
absent) to the creator of the user namespace also ensures that the
combination of uid and gid can already be obtained without privilege.
This is part of the fix for CVE-2014-8989.
Cc: stable@vger.kernel.org
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
kernel/user_namespace.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9451b12a9b6c..1e34de2fbd60 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -812,14 +812,16 @@ static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
+ const struct cred *cred = file->f_cred;
/* Don't allow mappings that would allow anything that wouldn't
* be allowed without the establishment of unprivileged mappings.
*/
- if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
+ uid_eq(ns->owner, cred->euid)) {
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, file->f_cred->euid))
+ if (uid_eq(uid, cred->euid))
return true;
}
}
--
2.1.0

View File

@ -1,80 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 9 Dec 2014 14:03:14 -0600
Subject: [PATCH] userns: Rename id_map_mutex to userns_state_mutex
Generalize id_map_mutex so it can be used for more state of a user namespace.
Cc: stable@vger.kernel.org
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
kernel/user_namespace.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1e34de2fbd60..44a555ac6104 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
@@ -583,9 +584,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-
-static DEFINE_MUTEX(id_map_mutex);
-
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -602,7 +600,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ssize_t ret = -EINVAL;
/*
- * The id_map_mutex serializes all writes to any given map.
+ * The userns_state_mutex serializes all writes to any given map.
*
* Any map is only ever written once.
*
@@ -620,7 +618,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
* order and smp_rmb() is guaranteed that we don't have crazy
* architectures returning stale data.
*/
- mutex_lock(&id_map_mutex);
+ mutex_lock(&userns_state_mutex);
ret = -EPERM;
/* Only allow one successful write to the map */
@@ -750,7 +748,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*ppos = count;
ret = count;
out:
- mutex_unlock(&id_map_mutex);
+ mutex_unlock(&userns_state_mutex);
if (page)
free_page(page);
return ret;
@@ -845,12 +843,12 @@ bool userns_may_setgroups(const struct user_namespace *ns)
{
bool allowed;
- mutex_lock(&id_map_mutex);
+ mutex_lock(&userns_state_mutex);
/* It is not safe to use setgroups until a gid mapping in
* the user namespace has been established.
*/
allowed = ns->gid_map.nr_extents != 0;
- mutex_unlock(&id_map_mutex);
+ mutex_unlock(&userns_state_mutex);
return allowed;
}
--
2.1.0

View File

@ -1,91 +0,0 @@
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 2 Dec 2014 13:56:30 -0600
Subject: [PATCH] userns: Unbreak the unprivileged remount tests
A security fix in caused the way the unprivileged remount tests were
using user namespaces to break. Tweak the way user namespaces are
being used so the test works again.
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
.../selftests/mount/unprivileged-remount-test.c | 32 ++++++++++++++++------
1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c
index 9669d375625a..517785052f1c 100644
--- a/tools/testing/selftests/mount/unprivileged-remount-test.c
+++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
@@ -53,17 +53,14 @@ static void die(char *fmt, ...)
exit(EXIT_FAILURE);
}
-static void write_file(char *filename, char *fmt, ...)
+static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
{
char buf[4096];
int fd;
ssize_t written;
int buf_len;
- va_list ap;
- va_start(ap, fmt);
buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
- va_end(ap);
if (buf_len < 0) {
die("vsnprintf failed: %s\n",
strerror(errno));
@@ -74,6 +71,8 @@ static void write_file(char *filename, char *fmt, ...)
fd = open(filename, O_WRONLY);
if (fd < 0) {
+ if ((errno == ENOENT) && enoent_ok)
+ return;
die("open of %s failed: %s\n",
filename, strerror(errno));
}
@@ -92,6 +91,26 @@ static void write_file(char *filename, char *fmt, ...)
}
}
+static void maybe_write_file(char *filename, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmaybe_write_file(true, filename, fmt, ap);
+ va_end(ap);
+
+}
+
+static void write_file(char *filename, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmaybe_write_file(false, filename, fmt, ap);
+ va_end(ap);
+
+}
+
static int read_mnt_flags(const char *path)
{
int ret;
@@ -144,13 +163,10 @@ static void create_and_enter_userns(void)
strerror(errno));
}
+ maybe_write_file("/proc/self/setgroups", "deny");
write_file("/proc/self/uid_map", "0 %d 1", uid);
write_file("/proc/self/gid_map", "0 %d 1", gid);
- if (setgroups(0, NULL) != 0) {
- die("setgroups failed: %s\n",
- strerror(errno));
- }
if (setgid(0) != 0) {
die ("setgid(0) failed %s\n",
strerror(errno));
--
2.1.0

View File

@ -1,72 +0,0 @@
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 5 Dec 2014 19:03:28 -0800
Subject: [PATCH] x86, kvm: Clear paravirt_enabled on KVM guests for espfix32's
benefit
paravirt_enabled has the following effects:
- Disables the F00F bug workaround warning. There is no F00F bug
workaround any more because Linux's standard IDT handling already
works around the F00F bug, but the warning still exists. This
is only cosmetic, and, in any event, there is no such thing as
KVM on a CPU with the F00F bug.
- Disables 32-bit APM BIOS detection. On a KVM paravirt system,
there should be no APM BIOS anyway.
- Disables tboot. I think that the tboot code should check the
CPUID hypervisor bit directly if it matters.
- paravirt_enabled disables espfix32. espfix32 should *not* be
disabled under KVM paravirt.
The last point is the purpose of this patch. It fixes a leak of the
high 16 bits of the kernel stack address on 32-bit KVM paravirt
guests.
While I'm at it, this removes pv_info setup from kvmclock. That
code seems to serve no purpose.
Cc: stable@vger.kernel.org
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
arch/x86/kernel/kvm.c | 9 ++++++++-
arch/x86/kernel/kvmclock.c | 2 --
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 3dd8e2c4d74a..07de51f66deb 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -282,7 +282,14 @@ NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+
+ /*
+ * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
+ * guest kernel works like a bare metal kernel with additional
+ * features, and paravirt_enabled is about features that are
+ * missing.
+ */
+ pv_info.paravirt_enabled = 0;
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d9156ceecdff..d4d9a8ad7893 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -263,8 +263,6 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
- pv_info.paravirt_enabled = 1;
- pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
--
2.1.0

View File

@ -1,77 +0,0 @@
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 4 Dec 2014 16:48:16 -0800
Subject: [PATCH] x86/tls: Validate TLS entries to protect espfix
Installing a 16-bit RW data segment into the GDT defeats espfix.
AFAICT this will not affect glibc, Wine, or dosemu at all.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@vger.kernel.org
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: security@kernel.org <security@kernel.org>
Cc: Willy Tarreau <w@1wt.eu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/tls.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index f7fec09e3e3a..e7650bd71109 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -27,6 +27,21 @@ static int get_free_idx(void)
return -ESRCH;
}
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ if (LDT_empty(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ return true;
+}
+
static void set_tls_desc(struct task_struct *p, int idx,
const struct user_desc *info, int n)
{
@@ -66,6 +81,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
if (copy_from_user(&info, u_info, sizeof(info)))
return -EFAULT;
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
if (idx == -1)
idx = info.entry_number;
@@ -192,6 +210,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
{
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
+ int i;
if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
@@ -205,6 +224,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
else
info = infobuf;
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
set_tls_desc(target,
GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
info, count / sizeof(struct user_desc));
--
2.1.0

View File

@ -1,309 +0,0 @@
From: Andy Lutomirski <luto@amacapital.net>
Date: Mon, 8 Dec 2014 13:55:20 -0800
Subject: [PATCH] x86_64, switch_to(): Load TLS descriptors before switching DS
and ES
Otherwise, if buggy user code points DS or ES into the TLS
array, they would be corrupted after a context switch.
This also significantly improves the comments and documents some
gotchas in the code.
Before this patch, the both tests below failed. With this
patch, the es test passes, although the gsbase test still fails.
----- begin es test -----
/*
* Copyright (c) 2014 Andy Lutomirski
* GPL v2
*/
static unsigned short GDT3(int idx)
{
return (idx << 3) | 3;
}
static int create_tls(int idx, unsigned int base)
{
struct user_desc desc = {
.entry_number = idx,
.base_addr = base,
.limit = 0xfffff,
.seg_32bit = 1,
.contents = 0, /* Data, grow-up */
.read_exec_only = 0,
.limit_in_pages = 1,
.seg_not_present = 0,
.useable = 0,
};
if (syscall(SYS_set_thread_area, &desc) != 0)
err(1, "set_thread_area");
return desc.entry_number;
}
int main()
{
int idx = create_tls(-1, 0);
printf("Allocated GDT index %d\n", idx);
unsigned short orig_es;
asm volatile ("mov %%es,%0" : "=rm" (orig_es));
int errors = 0;
int total = 1000;
for (int i = 0; i < total; i++) {
asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
usleep(100);
unsigned short es;
asm volatile ("mov %%es,%0" : "=rm" (es));
asm volatile ("mov %0,%%es" : : "rm" (orig_es));
if (es != GDT3(idx)) {
if (errors == 0)
printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
GDT3(idx), es);
errors++;
}
}
if (errors) {
printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
return 1;
} else {
printf("[OK]\tES was preserved\n");
return 0;
}
}
----- end es test -----
----- begin gsbase test -----
/*
* gsbase.c, a gsbase test
* Copyright (c) 2014 Andy Lutomirski
* GPL v2
*/
static unsigned char *testptr, *testptr2;
static unsigned char read_gs_testvals(void)
{
unsigned char ret;
asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
return ret;
}
int main()
{
int errors = 0;
testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
if (testptr == MAP_FAILED)
err(1, "mmap");
testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
if (testptr2 == MAP_FAILED)
err(1, "mmap");
*testptr = 0;
*testptr2 = 1;
if (syscall(SYS_arch_prctl, ARCH_SET_GS,
(unsigned long)testptr2 - (unsigned long)testptr) != 0)
err(1, "ARCH_SET_GS");
usleep(100);
if (read_gs_testvals() == 1) {
printf("[OK]\tARCH_SET_GS worked\n");
} else {
printf("[FAIL]\tARCH_SET_GS failed\n");
errors++;
}
asm volatile ("mov %0,%%gs" : : "r" (0));
if (read_gs_testvals() == 0) {
printf("[OK]\tWriting 0 to gs worked\n");
} else {
printf("[FAIL]\tWriting 0 to gs failed\n");
errors++;
}
usleep(100);
if (read_gs_testvals() == 0) {
printf("[OK]\tgsbase is still zero\n");
} else {
printf("[FAIL]\tgsbase was corrupted\n");
errors++;
}
return errors == 0 ? 0 : 1;
}
----- end gsbase test -----
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: <stable@vger.kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/process_64.c | 101 +++++++++++++++++++++++++++++++------------
1 file changed, 73 insertions(+), 28 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca5b02d405c3..166119618afb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -286,24 +286,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
- /*
- * Reload esp0, LDT and the page table pointer:
- */
+ /* Reload esp0 and ss1. */
load_sp0(tss, next);
- /*
- * Switch DS and ES.
- * This won't pick up thread selector changes, but I guess that is ok.
- */
- savesegment(es, prev->es);
- if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
-
- savesegment(ds, prev->ds);
- if (unlikely(next->ds | prev->ds))
- loadsegment(ds, next->ds);
-
-
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
*
@@ -312,41 +297,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
savesegment(fs, fsindex);
savesegment(gs, gsindex);
+ /*
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
+ */
load_TLS(next, cpu);
/*
- * Leave lazy mode, flushing any hypercalls made here.
- * This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them, and and it must
+ * be done before math_state_restore, so the TS bit is up to
+ * date.
*/
arch_end_context_switch(next_p);
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
+ */
+ savesegment(es, prev->es);
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+ savesegment(ds, prev->ds);
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
/*
* Switch FS and GS.
*
- * Segment register != 0 always requires a reload. Also
- * reload when it has changed. When prev process used 64bit
- * base always reload to avoid an information leak.
+ * These are even more complicated than FS and GS: they have
+ * 64-bit bases are that controlled by arch_prctl. Those bases
+ * only differ from the values in the GDT or LDT if the selector
+ * is 0.
+ *
+ * Loading the segment register resets the hidden base part of
+ * the register to 0 or the value from the GDT / LDT. If the
+ * next base address zero, writing 0 to the segment register is
+ * much faster than using wrmsr to explicitly zero the base.
+ *
+ * The thread_struct.fs and thread_struct.gs values are 0
+ * if the fs and gs bases respectively are not overridden
+ * from the values implied by fsindex and gsindex. They
+ * are nonzero, and store the nonzero base addresses, if
+ * the bases are overridden.
+ *
+ * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+ * be impossible.
+ *
+ * Therefore we need to reload the segment registers if either
+ * the old or new selector is nonzero, and we need to override
+ * the base address if next thread expects it to be overridden.
+ *
+ * This code is unnecessarily slow in the case where the old and
+ * new indexes are zero and the new base is nonzero -- it will
+ * unnecessarily write 0 to the selector before writing the new
+ * base address.
+ *
+ * Note: This all depends on arch_prctl being the only way that
+ * user code can override the segment base. Once wrfsbase and
+ * wrgsbase are enabled, most of this code will need to change.
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
+
/*
- * Check if the user used a selector != 0; if yes
- * clear 64bit base, since overloaded base is always
- * mapped to the Null selector
+ * If user code wrote a nonzero value to FS, then it also
+ * cleared the overridden base address.
+ *
+ * XXX: if user code wrote 0 to FS and cleared the base
+ * address itself, we won't notice and we'll incorrectly
+ * restore the prior base address next time we reschdule
+ * the process.
*/
if (fsindex)
prev->fs = 0;
}
- /* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
+
+ /* This works (and fails) the same way as fsindex above. */
if (gsindex)
prev->gs = 0;
}
--
2.1.0