systemd/0053-seccomp-rework-seccomp-code-to-improve-compat-with-s.patch

2019 lines
75 KiB
Diff
Raw Normal View History

2017-01-31 17:11:17 +00:00
From 71357f8a2ac8b1435e81b11e123c7223340a94c8 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Tue, 27 Dec 2016 15:28:25 +0100
Subject: [PATCH] seccomp: rework seccomp code, to improve compat with some
archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
(cherry picked from commit 469830d1426a91e0897c321fdc8ee428f0a750c1)
---
src/core/execute.c | 466 ++++++++----------------------
src/core/main.c | 34 +--
src/nspawn/nspawn-seccomp.c | 117 ++++----
src/shared/seccomp-util.c | 670 +++++++++++++++++++++++++++++++++++---------
src/shared/seccomp-util.h | 25 +-
src/test/test-execute.c | 1 +
src/test/test-seccomp.c | 272 +++++++++++++++++-
7 files changed, 1016 insertions(+), 569 deletions(-)
diff --git a/src/core/execute.c b/src/core/execute.c
index 59ce0774c4..2dfd43a8f2 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1184,6 +1184,41 @@ static void rename_process_from_path(const char *path) {
rename_process(process_name);
}
+static bool context_has_address_families(const ExecContext *c) {
+ assert(c);
+
+ return c->address_families_whitelist ||
+ !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_whitelist ||
+ !set_isempty(c->syscall_filter);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+ assert(c);
+
+ if (c->no_new_privileges)
+ return true;
+
+ if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+ return false;
+
+ /* We need NNP if we have any form of seccomp and are unprivileged */
+ return context_has_address_families(c) ||
+ c->memory_deny_write_execute ||
+ c->restrict_realtime ||
+ exec_context_restrict_namespaces_set(c) ||
+ c->protect_kernel_tunables ||
+ c->protect_kernel_modules ||
+ c->private_devices ||
+ context_has_syscall_filters(c) ||
+ !set_isempty(c->syscall_archs);
+}
+
#ifdef HAVE_SECCOMP
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
@@ -1197,344 +1232,131 @@ static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
return true;
}
-static int apply_seccomp(const Unit* u, const ExecContext *c) {
- uint32_t negative_action, action;
- scmp_filter_ctx seccomp;
- Iterator i;
- void *id;
- int r;
+static int apply_syscall_filter(const Unit* u, const ExecContext *c) {
+ uint32_t negative_action, default_action, action;
+ assert(u);
assert(c);
- if (skip_seccomp_unavailable(u, "syscall filtering"))
+ if (!context_has_syscall_filters(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "SystemCallFilter="))
return 0;
negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
- seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- if (c->syscall_archs) {
-
- SET_FOREACH(id, c->syscall_archs, i) {
- r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
- if (r == -EEXIST)
- continue;
- if (r < 0)
- goto finish;
- }
-
+ if (c->syscall_whitelist) {
+ default_action = negative_action;
+ action = SCMP_ACT_ALLOW;
} else {
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0)
- goto finish;
+ default_action = SCMP_ACT_ALLOW;
+ action = negative_action;
}
- action = c->syscall_whitelist ? SCMP_ACT_ALLOW : negative_action;
- SET_FOREACH(id, c->syscall_filter, i) {
- r = seccomp_rule_add(seccomp, action, PTR_TO_INT(id) - 1, 0);
- if (r < 0)
- goto finish;
- }
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
+}
+
+static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
+ if (set_isempty(c->syscall_archs))
+ return 0;
- r = seccomp_load(seccomp);
+ if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
+ return 0;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_restrict_archs(c->syscall_archs);
}
static int apply_address_families(const Unit* u, const ExecContext *c) {
- scmp_filter_ctx seccomp;
- Iterator i;
- int r;
-
+ assert(u);
assert(c);
+ if (!context_has_address_families(c))
+ return 0;
+
if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- if (c->address_families_whitelist) {
- int af, first = 0, last = 0;
- void *afp;
-
- /* If this is a whitelist, we first block the address
- * families that are out of range and then everything
- * that is not in the set. First, we find the lowest
- * and highest address family in the set. */
-
- SET_FOREACH(afp, c->address_families, i) {
- af = PTR_TO_INT(afp);
-
- if (af <= 0 || af >= af_max())
- continue;
-
- if (first == 0 || af < first)
- first = af;
-
- if (last == 0 || af > last)
- last = af;
- }
-
- assert((first == 0) == (last == 0));
-
- if (first == 0) {
-
- /* No entries in the valid range, block everything */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 0);
- if (r < 0)
- goto finish;
-
- } else {
-
- /* Block everything below the first entry */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_LT, first));
- if (r < 0)
- goto finish;
-
- /* Block everything above the last entry */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_GT, last));
- if (r < 0)
- goto finish;
-
- /* Block everything between the first and last
- * entry */
- for (af = 1; af < af_max(); af++) {
-
- if (set_contains(c->address_families, INT_TO_PTR(af)))
- continue;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_EQ, af));
- if (r < 0)
- goto finish;
- }
- }
-
- } else {
- void *af;
-
- /* If this is a blacklist, then generate one rule for
- * each address family that are then combined in OR
- * checks. */
-
- SET_FOREACH(af, c->address_families, i) {
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
- if (r < 0)
- goto finish;
- }
- }
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
}
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
- scmp_filter_ctx seccomp;
- int r;
-
+ assert(u);
assert(c);
+ if (!c->memory_deny_write_execute)
+ return 0;
+
if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(mmap),
- 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
- if (r < 0)
- goto finish;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(mprotect),
- 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
- if (r < 0)
- goto finish;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(shmat),
- 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_memory_deny_write_execute();
}
static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
- static const int permitted_policies[] = {
- SCHED_OTHER,
- SCHED_BATCH,
- SCHED_IDLE,
- };
-
- scmp_filter_ctx seccomp;
- unsigned i;
- int r, p, max_policy = 0;
-
+ assert(u);
assert(c);
+ if (!c->restrict_realtime)
+ return 0;
+
if (skip_seccomp_unavailable(u, "RestrictRealtime="))
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- /* Determine the highest policy constant we want to allow */
- for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
- if (permitted_policies[i] > max_policy)
- max_policy = permitted_policies[i];
-
- /* Go through all policies with lower values than that, and block them -- unless they appear in the
- * whitelist. */
- for (p = 0; p < max_policy; p++) {
- bool good = false;
-
- /* Check if this is in the whitelist. */
- for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
- if (permitted_policies[i] == p) {
- good = true;
- break;
- }
-
- if (good)
- continue;
-
- /* Deny this policy */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(sched_setscheduler),
- 1,
- SCMP_A1(SCMP_CMP_EQ, p));
- if (r < 0)
- goto finish;
- }
-
- /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
- * hence no need no check for < 0 values. */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(sched_setscheduler),
- 1,
- SCMP_A1(SCMP_CMP_GT, max_policy));
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_restrict_realtime();
}
static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
- scmp_filter_ctx seccomp;
- int r;
-
+ assert(u);
assert(c);
/* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
* let's protect even those systems where this is left on in the kernel. */
+ if (!c->protect_kernel_tunables)
+ return 0;
+
if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(_sysctl),
- 0);
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_protect_sysctl();
}
static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
/* Turn off module syscalls on ProtectKernelModules=yes */
+ if (!c->protect_kernel_modules)
+ return 0;
+
if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
return 0;
- return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
}
static int apply_private_devices(const Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
/* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
+ if (!c->private_devices)
+ return 0;
+
if (skip_seccomp_unavailable(u, "PrivateDevices="))
return 0;
- return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
}
static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
if (!exec_context_restrict_namespaces_set(c))
@@ -2168,40 +1990,6 @@ static int close_remaining_fds(
return close_all_fds(dont_close, n_dont_close);
}
-static bool context_has_address_families(const ExecContext *c) {
- assert(c);
-
- return c->address_families_whitelist ||
- !set_isempty(c->address_families);
-}
-
-static bool context_has_syscall_filters(const ExecContext *c) {
- assert(c);
-
- return c->syscall_whitelist ||
- !set_isempty(c->syscall_filter) ||
- !set_isempty(c->syscall_archs);
-}
-
-static bool context_has_no_new_privileges(const ExecContext *c) {
- assert(c);
-
- if (c->no_new_privileges)
- return true;
-
- if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
- return false;
-
- return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
- c->memory_deny_write_execute ||
- c->restrict_realtime ||
- exec_context_restrict_namespaces_set(c) ||
- c->protect_kernel_tunables ||
- c->protect_kernel_modules ||
- c->private_devices ||
- context_has_syscall_filters(c);
-}
-
static int send_user_lookup(
Unit *unit,
int user_lookup_fd,
@@ -2753,28 +2541,22 @@ static int exec_child(
}
#ifdef HAVE_SECCOMP
- if (context_has_address_families(context)) {
- r = apply_address_families(unit, context);
- if (r < 0) {
- *exit_status = EXIT_ADDRESS_FAMILIES;
- return r;
- }
+ r = apply_address_families(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_ADDRESS_FAMILIES;
+ return r;
}
- if (context->memory_deny_write_execute) {
- r = apply_memory_deny_write_execute(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_memory_deny_write_execute(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
- if (context->restrict_realtime) {
- r = apply_restrict_realtime(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_restrict_realtime(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
r = apply_restrict_namespaces(unit, context);
@@ -2783,38 +2565,36 @@ static int exec_child(
return r;
}
- if (context->protect_kernel_tunables) {
- r = apply_protect_sysctl(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_protect_sysctl(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
- if (context->protect_kernel_modules) {
- r = apply_protect_kernel_modules(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_protect_kernel_modules(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
- if (context->private_devices) {
- r = apply_private_devices(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_private_devices(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
+ }
+
+ r = apply_syscall_archs(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
/* This really should remain the last step before the execve(), to make sure our own code is unaffected
* by the filter as little as possible. */
- if (context_has_syscall_filters(context)) {
- r = apply_seccomp(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_syscall_filter(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
#endif
}
diff --git a/src/core/main.c b/src/core/main.c
index 94602611a7..fc1ae123a8 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -1185,44 +1185,16 @@ oom:
static int enforce_syscall_archs(Set *archs) {
#ifdef HAVE_SECCOMP
- scmp_filter_ctx *seccomp;
- Iterator i;
- void *id;
int r;
if (!is_seccomp_available())
return 0;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return log_oom();
-
- SET_FOREACH(id, arg_syscall_archs, i) {
- r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
- if (r == -EEXIST)
- continue;
- if (r < 0) {
- log_error_errno(r, "Failed to add architecture to seccomp: %m");
- goto finish;
- }
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
- goto finish;
- }
-
- r = seccomp_load(seccomp);
+ r = seccomp_restrict_archs(arg_syscall_archs);
if (r < 0)
- log_error_errno(r, "Failed to add install architecture seccomp: %m");
-
-finish:
- seccomp_release(seccomp);
- return r;
-#else
- return 0;
+ return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
#endif
+ return 0;
}
static int status_welcome(void) {
diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c
index 03a397d30c..72ecc51b16 100644
--- a/src/nspawn/nspawn-seccomp.c
+++ b/src/nspawn/nspawn-seccomp.c
@@ -26,20 +26,21 @@
#include <seccomp.h>
#endif
+#include "alloc-util.h"
#include "log.h"
-
-#ifdef HAVE_SECCOMP
-#include "seccomp-util.h"
-#endif
-
#include "nspawn-seccomp.h"
+#ifdef HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+#include "string-util.h"
#ifdef HAVE_SECCOMP
-static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx,
- uint64_t cap_list_retain) {
- unsigned i;
- int r;
+static int seccomp_add_default_syscall_filter(
+ scmp_filter_ctx ctx,
+ uint32_t arch,
+ uint64_t cap_list_retain) {
+
static const struct {
uint64_t capability;
int syscall_num;
@@ -111,23 +112,29 @@ static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx,
{ CAP_SYS_TIME, SCMP_SYS(settimeofday) },
{ CAP_SYS_TIME, SCMP_SYS(stime) },
};
+ unsigned i;
+ int r, c = 0;
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
continue;
- r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
- if (r == -EFAULT)
- continue; /* unknown syscall */
- if (r < 0)
- return log_error_errno(r, "Failed to block syscall: %m");
+ r = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
+ if (r < 0) {
+ /* If the system call is not known on this architecture, then that's fine, let's ignore it */
+ _cleanup_free_ char *n = NULL;
+
+ n = seccomp_syscall_resolve_num_arch(arch, blacklist[i].syscall_num);
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", strna(n));
+ } else
+ c++;
}
- return 0;
+ return c;
}
int setup_seccomp(uint64_t cap_list_retain) {
- scmp_filter_ctx seccomp;
+ uint32_t arch;
int r;
if (!is_seccomp_available()) {
@@ -135,45 +142,51 @@ int setup_seccomp(uint64_t cap_list_retain) {
return 0;
}
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return log_error_errno(r, "Failed to allocate seccomp object: %m");
-
- r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
- if (r < 0)
- goto finish;
-
- /*
- Audit is broken in containers, much of the userspace audit
- hookup will fail if running inside a container. We don't
- care and just turn off creation of audit sockets.
-
- This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
- with EAFNOSUPPORT which audit userspace uses as indication
- that audit is disabled in the kernel.
- */
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EAFNOSUPPORT),
- SCMP_SYS(socket),
- 2,
- SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
- SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
- if (r < 0) {
- log_error_errno(r, "Failed to add audit seccomp rule: %m");
- goto finish;
- }
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ int n;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate seccomp object: %m");
+
+ n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
+ if (n < 0)
+ return n;
+
+ /*
+ Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
+ container. We don't care and just turn off creation of audit sockets.
+
+ This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses
+ as indication that audit is disabled in the kernel.
+ */
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 2,
+ SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
+ SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
+ else
+ n++;
+
+ if (n <= 0) /* no rule added? then skip this architecture */
+ continue;
- r = seccomp_load(seccomp);
- if (r < 0) {
- log_error_errno(r, "Failed to install seccomp audit filter: %m");
- goto finish;
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return log_error_errno(r, "Failed to install seccomp audit filter: %m");
+ if (r < 0)
+ log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
}
-finish:
- seccomp_release(seccomp);
- return r;
+ return 0;
}
#else
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index 55b97e1efb..aa37e12db7 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -18,17 +18,52 @@
***/
#include <errno.h>
+#include <linux/seccomp.h>
#include <seccomp.h>
#include <stddef.h>
+#include <sys/mman.h>
#include <sys/prctl.h>
-#include <linux/seccomp.h>
+#include <sys/shm.h>
+#include "af-list.h"
#include "alloc-util.h"
#include "macro.h"
#include "nsflags.h"
#include "seccomp-util.h"
#include "string-util.h"
#include "util.h"
+#include "errno-list.h"
+
+const uint32_t seccomp_local_archs[] = {
+
+#if defined(__i386__) || defined(__x86_64__)
+ SCMP_ARCH_X86,
+ SCMP_ARCH_X86_64,
+ SCMP_ARCH_X32,
+
+#elif defined(__arm__) || defined(__aarch64__)
+ SCMP_ARCH_ARM,
+ SCMP_ARCH_AARCH64,
+
+#elif defined(__mips__) || defined(__mips64__)
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPS64,
+ SCMP_ARCH_MIPS64N32,
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPSEL64,
+ SCMP_ARCH_MIPSEL64N32,
+
+#elif defined(__powerpc__) || defined(__powerpc64__)
+ SCMP_ARCH_PPC,
+ SCMP_ARCH_PPC64,
+ SCMP_ARCH_PPC64LE,
+
+#elif defined(__s390__) || defined(__s390x__)
+ SCMP_ARCH_S390,
+ SCMP_ARCH_S390X,
+#endif
+ (uint32_t) -1
+ };
const char* seccomp_arch_to_string(uint32_t c) {
/* Maintain order used in <seccomp.h>.
@@ -122,18 +157,37 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) {
return 0;
}
-int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
scmp_filter_ctx seccomp;
int r;
- /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are
- * added by default, and NNP is turned off. */
+ /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
+ * any others. Also, turns off the NNP fiddling. */
seccomp = seccomp_init(default_action);
if (!seccomp)
return -ENOMEM;
- r = seccomp_add_secondary_archs(seccomp);
+ if (arch != SCMP_ARCH_NATIVE &&
+ arch != seccomp_arch_native()) {
+
+ r = seccomp_arch_add(seccomp, arch);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_arch_remove(seccomp, seccomp_arch_native());
+ if (r < 0)
+ goto finish;
+
+ assert(seccomp_arch_exist(seccomp, arch) >= 0);
+ assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
+ assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
+ } else {
+ assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
+ assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
+ }
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
if (r < 0)
goto finish;
@@ -149,56 +203,6 @@ finish:
return r;
}
-int seccomp_add_secondary_archs(scmp_filter_ctx ctx) {
-
- /* Add in all possible secondary archs we are aware of that
- * this kernel might support. */
-
- static const int seccomp_arches[] = {
-#if defined(__i386__) || defined(__x86_64__)
- SCMP_ARCH_X86,
- SCMP_ARCH_X86_64,
- SCMP_ARCH_X32,
-
-#elif defined(__arm__) || defined(__aarch64__)
- SCMP_ARCH_ARM,
- SCMP_ARCH_AARCH64,
-
-#elif defined(__arm__) || defined(__aarch64__)
- SCMP_ARCH_ARM,
- SCMP_ARCH_AARCH64,
-
-#elif defined(__mips__) || defined(__mips64__)
- SCMP_ARCH_MIPS,
- SCMP_ARCH_MIPS64,
- SCMP_ARCH_MIPS64N32,
- SCMP_ARCH_MIPSEL,
- SCMP_ARCH_MIPSEL64,
- SCMP_ARCH_MIPSEL64N32,
-
-#elif defined(__powerpc__) || defined(__powerpc64__)
- SCMP_ARCH_PPC,
- SCMP_ARCH_PPC64,
- SCMP_ARCH_PPC64LE,
-
-#elif defined(__s390__) || defined(__s390x__)
- SCMP_ARCH_S390,
- SCMP_ARCH_S390X,
-#endif
- };
-
- unsigned i;
- int r;
-
- for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) {
- r = seccomp_arch_add(ctx, seccomp_arches[i]);
- if (r < 0 && r != -EEXIST)
- return r;
- }
-
- return 0;
-}
-
static bool is_basic_seccomp_available(void) {
int r;
r = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
@@ -523,7 +527,12 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) {
return NULL;
}
-int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
+static int seccomp_add_syscall_filter_set(
+ scmp_filter_ctx seccomp,
+ uint32_t default_action,
+ const SyscallFilterSet *set,
+ uint32_t action) {
+
const char *sys;
int r;
@@ -540,47 +549,102 @@ int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterS
if (!other)
return -EINVAL;
- r = seccomp_add_syscall_filter_set(seccomp, other, action);
+ r = seccomp_add_syscall_filter_set(seccomp, default_action, other, action);
+ if (r < 0)
+ return r;
} else {
id = seccomp_syscall_resolve_name(sys);
if (id == __NR_SCMP_ERROR)
- return -EINVAL;
+ return -EINVAL; /* Not known at all? Then that's a real error */
- r = seccomp_rule_add(seccomp, action, id, 0);
+ r = seccomp_rule_add_exact(seccomp, action, id, 0);
+ if (r < 0)
+ /* If the system call is not known on this architecture, then that's fine, let's ignore it */
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", sys);
}
- if (r < 0)
- return r;
}
return 0;
}
-int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
- scmp_filter_ctx seccomp;
+int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
+ uint32_t arch;
int r;
assert(set);
- /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
+ /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
+ * each local arch. */
- r = seccomp_init_conservative(&seccomp, default_action);
- if (r < 0)
- return r;
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
- r = seccomp_add_syscall_filter_set(seccomp, set, action);
- if (r < 0)
- goto finish;
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
- r = seccomp_load(seccomp);
+ r = seccomp_init_for_arch(&seccomp, arch, default_action);
+ if (r < 0)
+ return r;
-finish:
- seccomp_release(seccomp);
- return r;
+ r = seccomp_add_syscall_filter_set(seccomp, default_action, set, action);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add filter set, ignoring: %m");
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
+ uint32_t arch;
+ int r;
+
+ /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
+ * SyscallFilterSet* table. */
+
+ if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
+ return 0;
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ Iterator i;
+ void *id;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, default_action);
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(id, set, i) {
+ r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
+ if (r < 0) {
+ /* If the system call is not known on this architecture, then that's fine, let's ignore it */
+ _cleanup_free_ char *n = NULL;
+
+ n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", strna(n));
+ }
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
}
int seccomp_restrict_namespaces(unsigned long retain) {
- scmp_filter_ctx seccomp;
- unsigned i;
+ uint32_t arch;
int r;
if (log_get_max_level() >= LOG_DEBUG) {
@@ -594,74 +658,420 @@ int seccomp_restrict_namespaces(unsigned long retain) {
if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- if ((retain & NAMESPACE_FLAGS_ALL) == 0)
- /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
- * altogether. */
- r = seccomp_rule_add(
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ unsigned i;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ if ((retain & NAMESPACE_FLAGS_ALL) == 0)
+ /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
+ * altogether. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 0);
+ else
+ /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
+ * special invocation with a zero flags argument, right here. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, 0));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ for (i = 0; namespace_flag_map[i].name; i++) {
+ unsigned long f;
+
+ f = namespace_flag_map[i].flag;
+ if ((retain & f) == f) {
+ log_debug("Permitting %s.", namespace_flag_map[i].name);
+ continue;
+ }
+
+ log_debug("Blocking %s.", namespace_flag_map[i].name);
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(unshare),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ break;
+ }
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ break;
+ }
+
+ if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ break;
+ }
+ }
+ }
+ if (r < 0)
+ continue;
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_protect_sysctl(void) {
+ uint32_t arch;
+ int r;
+
+ /* Installs, for each local architecture, an allow-by-default filter that makes the _sysctl() system
+ * call fail with EPERM. Architectures for which the rule cannot be added, or for which loading fails with
+ * anything but -EPERM/-EACCES, are skipped with a debug message. Returns 0, or a negative value if
+ * filter initialization fails or seccomp_load() reports -EPERM/-EACCES. */
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(setns),
+ SCMP_SYS(_sysctl),
0);
- else
- /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
- * special invocation with a zero flags argument, right here. */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(setns),
- 1,
- SCMP_A1(SCMP_CMP_EQ, 0));
- if (r < 0)
- goto finish;
-
- for (i = 0; namespace_flag_map[i].name; i++) {
- unsigned long f;
-
- f = namespace_flag_map[i].flag;
- if ((retain & f) == f) {
- log_debug("Permitting %s.", namespace_flag_map[i].name);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
continue;
}
- log_debug("Blocking %s.", namespace_flag_map[i].name);
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(unshare),
- 1,
- SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
if (r < 0)
- goto finish;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(clone),
- 1,
- SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
+ uint32_t arch;
+ int r;
+
+ /* Installs, for each local architecture, an allow-by-default filter on the first argument of socket().
+ * With whitelist=true every address family not contained in the set is refused with EAFNOSUPPORT, with
+ * whitelist=false only the families contained in the set are refused. Set entries are decoded with
+ * PTR_TO_INT(). If any rule cannot be added, the whole architecture is skipped. Returns 0, or a negative
+ * value if filter initialization fails or seccomp_load() reports -EPERM/-EACCES. */
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ Iterator i;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
if (r < 0)
- goto finish;
+ return r;
+
+ if (whitelist) {
+ int af, first = 0, last = 0;
+ void *afp;
+
+ /* If this is a whitelist, we first block the address families that are out of range and then
+ * everything that is not in the set. First, we find the lowest and highest address family in
+ * the set. Entries outside of the range 1…af_max()-1 are not considered. */
+
+ SET_FOREACH(afp, address_families, i) {
+ af = PTR_TO_INT(afp);
+
+ if (af <= 0 || af >= af_max())
+ continue;
+
+ if (first == 0 || af < first)
+ first = af;
+
+ if (last == 0 || af > last)
+ last = af;
+ }
+
+ assert((first == 0) == (last == 0));
+
+ if (first == 0) {
+
+ /* No entries in the valid range, block everything */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 0);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ } else {
+
+ /* Block everything below the first entry */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_LT, first));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* Block everything above the last entry */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_GT, last));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* Block every family in the range 1…af_max()-1 that is not in the set. Note that this walks the
+ * whole valid range, not just the families between the first and last entry, hence it also
+ * emits rules for families already covered by the two range rules above. */
+ for (af = 1; af < af_max(); af++) {
+
+ if (set_contains(address_families, INT_TO_PTR(af)))
+ continue;
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_EQ, af));
+ if (r < 0)
+ break;
+ }
+
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ } else {
+ void *af;
+
+ /* If this is a blacklist, then generate one rule for
+ * each address family that are then combined in OR
+ * checks. */
+
+ SET_FOREACH(af, address_families, i) {
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
+ if (r < 0)
+ break;
+ }
+
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_restrict_realtime(void) {
+        static const int permitted_policies[] = {
+                SCHED_OTHER,
+                SCHED_BATCH,
+                SCHED_IDLE,
+        };
+
+        int r, max_policy = 0;
+        uint32_t arch;
+        unsigned i;
+
+        /* Installs, for each local architecture, an allow-by-default filter that makes sched_setscheduler()
+         * fail with EPERM for every scheduling policy that is not listed in permitted_policies[]. If any rule
+         * cannot be added the whole architecture is skipped, so that we never load a filter that is missing
+         * some of its deny rules. Returns 0, or a negative value if filter initialization fails or
+         * seccomp_load() reports -EPERM/-EACCES. */
+
+        /* Determine the highest policy constant we want to allow */
+        for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+                if (permitted_policies[i] > max_policy)
+                        max_policy = permitted_policies[i];
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+                int p;
+
+                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                /* Go through all policies with lower values than that, and block them -- unless they appear in the
+                 * whitelist. */
+                for (p = 0; p < max_policy; p++) {
+                        bool good = false;
+
+                        /* Check if this is in the whitelist. */
+                        for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+                                if (permitted_policies[i] == p) {
+                                        good = true;
+                                        break;
+                                }
+
+                        if (good)
+                                continue;
-                if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
-                        r = seccomp_rule_add(
+                        /* Deny this policy */
+                        r = seccomp_rule_add_exact(
                                         seccomp,
                                         SCMP_ACT_ERRNO(EPERM),
-                                        SCMP_SYS(setns),
+                                        SCMP_SYS(sched_setscheduler),
                                         1,
-                                        SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
-                        if (r < 0)
-                                goto finish;
+                                        SCMP_A1(SCMP_CMP_EQ, p));
+                        if (r < 0) {
+                                log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+                                /* Leave the policy loop; the check below then skips the architecture. */
+                                break;
+                        }
                 }
+                if (r < 0)
+                        continue;
+
+                /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
+                 * unsigned here, hence no need to check for < 0 values. */
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_SYS(sched_setscheduler),
+                                1,
+                                SCMP_A1(SCMP_CMP_GT, max_policy));
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (IN_SET(r, -EPERM, -EACCES))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+int seccomp_memory_deny_write_execute(void) {
+ uint32_t arch;
+ int r;
+
+ /* Installs, for each local architecture, an allow-by-default filter that refuses with EPERM:
+ *
+ * - mmap() calls whose third argument has both PROT_EXEC and PROT_WRITE set,
+ * - mprotect() calls whose third argument has PROT_EXEC set,
+ * - shmat() calls whose third argument has SHM_EXEC set.
+ *
+ * If any of the three rules cannot be added, no filter at all is loaded for that architecture. Returns 0,
+ * or a negative value if filter initialization fails or seccomp_load() reports -EPERM/-EACCES. */
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mmap),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add mmap() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mprotect),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add mprotect() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(shmat),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add shmat() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_restrict_archs(Set *archs) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ Iterator i;
+ void *id;
+ int r;
+
+ /* This installs a filter with no rules, but that restricts the system call architectures to the specified
+ * list. Each set entry is decoded as PTR_TO_UINT32(entry) - 1, i.e. callers store the architecture
+ * identifier plus one. Unlike the other helpers in this file a single filter is used for all
+ * architectures. Returns the result of seccomp_load(), or a negative value if an earlier step fails. */
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return -ENOMEM;
+
+ SET_FOREACH(id, archs, i) {
+ r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
+ /* -EEXIST is not treated as an error: presumably the architecture is already part of the filter
+ * (e.g. the native one) -- confirm against the libseccomp documentation. */
+ if (r == -EEXIST)
+ continue;
+ if (r < 0)
+ return r;
}
- r = seccomp_load(seccomp);
+ /* Set libseccomp's NNP control attribute to 0 before loading. NOTE(review): per the libseccomp
+ * documentation this should keep seccomp_load() from turning on no_new_privs itself -- confirm. */
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ return r;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_load(seccomp);
}
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index e325dab628..50e4f43c43 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -23,12 +23,12 @@
#include <stdbool.h>
#include <stdint.h>
+#include "set.h"
+
const char* seccomp_arch_to_string(uint32_t c);
int seccomp_arch_from_string(const char *n, uint32_t *ret);
-int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action);
-
-int seccomp_add_secondary_archs(scmp_filter_ctx c);
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action);
bool is_seccomp_available(void);
@@ -61,8 +61,21 @@ extern const SyscallFilterSet syscall_filter_sets[];
const SyscallFilterSet *syscall_filter_set_find(const char *name);
-int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
-
-int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
+int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
+int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);
+int seccomp_restrict_archs(Set *archs);
int seccomp_restrict_namespaces(unsigned long retain);
+int seccomp_protect_sysctl(void);
+int seccomp_restrict_address_families(Set *address_families, bool whitelist);
+int seccomp_restrict_realtime(void);
+int seccomp_memory_deny_write_execute(void);
+
+extern const uint32_t seccomp_local_archs[];
+
+/* Iterates over seccomp_local_archs[], assigning each element in turn to 'arch', and stops at the first
+ * element equal to (uint32_t) -1. When the loop ends without a break, 'arch' holds that terminating value.
+ * The loop counter '_i' is declared by the macro itself, so a nested use shadows the outer counter. */
+#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
+ for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \
+ seccomp_local_archs[_i] != (uint32_t) -1; \
+ (arch) = seccomp_local_archs[++_i])
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(scmp_filter_ctx, seccomp_release);
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index 6029853e3e..7d7790cf1e 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -457,6 +457,7 @@ int main(int argc, char *argv[]) {
};
int r;
+ log_set_max_level(LOG_DEBUG);
log_parse_environment();
log_open();
diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c
index beb6a7f422..6f15879c45 100644
--- a/src/test/test-seccomp.c
+++ b/src/test/test-seccomp.c
@@ -17,10 +17,12 @@
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
+#include <sched.h>
#include <stdlib.h>
#include <sys/eventfd.h>
+#include <sys/mman.h>
#include <unistd.h>
-#include <sched.h>
+#include <sys/poll.h>
#include "alloc-util.h"
#include "fd-util.h"
@@ -30,8 +32,10 @@
#include "process-util.h"
#include "raw-clone.h"
#include "seccomp-util.h"
+#include "set.h"
#include "string-util.h"
#include "util.h"
+#include "virt.h"
static void test_seccomp_arch_to_string(void) {
uint32_t a, b;
@@ -92,7 +96,6 @@ static void test_filter_sets(void) {
if (!is_seccomp_available())
return;
-
if (geteuid() != 0)
return;
@@ -108,16 +111,16 @@ static void test_filter_sets(void) {
int fd;
if (i == SYSCALL_FILTER_SET_DEFAULT) /* if we look at the default set, whitelist instead of blacklist */
- r = seccomp_load_filter_set(SCMP_ACT_ERRNO(EPERM), syscall_filter_sets + i, SCMP_ACT_ALLOW);
+ r = seccomp_load_syscall_filter_set(SCMP_ACT_ERRNO(EUCLEAN), syscall_filter_sets + i, SCMP_ACT_ALLOW);
else
- r = seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EPERM));
+ r = seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EUCLEAN));
if (r < 0)
_exit(EXIT_FAILURE);
/* Test the sycall filter with one random system call */
fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC);
if (IN_SET(i, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_DEFAULT))
- assert_se(fd < 0 && errno == EPERM);
+ assert_se(fd < 0 && errno == EUCLEAN);
else {
assert_se(fd >= 0);
safe_close(fd);
@@ -132,8 +135,8 @@ static void test_filter_sets(void) {
static void test_restrict_namespace(void) {
_cleanup_free_ char *s = NULL;
- pid_t pid;
unsigned long ul;
+ pid_t pid;
assert_se(namespace_flag_to_string(0) == NULL);
assert_se(streq(namespace_flag_to_string(CLONE_NEWNS), "mnt"));
@@ -157,7 +160,6 @@ static void test_restrict_namespace(void) {
if (!is_seccomp_available())
return;
-
if (geteuid() != 0)
return;
@@ -216,6 +218,256 @@ static void test_restrict_namespace(void) {
assert_se(wait_for_terminate_and_warn("nsseccomp", pid, true) == EXIT_SUCCESS);
}
+static void test_protect_sysctl(void) {
+        pid_t pid;
+
+        /* Checks in a forked child that _sysctl() is refused with EPERM once seccomp_protect_sysctl() has
+         * been installed. Skipped without seccomp support, without root, and inside containers. */
+
+        if (!is_seccomp_available())
+                return;
+        if (geteuid() != 0)
+                return;
+
+        if (detect_container() > 0) /* in containers _sysctl() is likely missing anyway */
+                return;
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                /* Not every architecture provides _sysctl(): only issue the system call directly where a real
+                 * (positive) system call number is available. NOTE(review): libseccomp is understood to
+                 * define __NR__sysctl as a negative pseudo number where the call is missing -- confirm. */
+#if defined(__NR__sysctl) && __NR__sysctl > 0
+                /* Before the filter is installed the kernel rejects the NULL argument itself. */
+                assert_se(syscall(__NR__sysctl, NULL) < 0);
+                assert_se(errno == EFAULT);
+#endif
+
+                assert_se(seccomp_protect_sysctl() >= 0);
+
+#if defined(__NR__sysctl) && __NR__sysctl > 0
+                /* With the filter in place the call must be refused before it reaches the kernel proper. */
+                assert_se(syscall(__NR__sysctl, 0, 0, 0) < 0);
+                assert_se(errno == EPERM);
+#endif
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        assert_se(wait_for_terminate_and_warn("sysctlseccomp", pid, true) == EXIT_SUCCESS);
+}
+
+static void test_restrict_address_families(void) {
+        pid_t pid;
+
+        /* Checks in a forked child that seccomp_restrict_address_families() refuses socket() with
+         * EAFNOSUPPORT, first in blacklist mode and then, stacked on top, in whitelist mode. */
+
+        if (!is_seccomp_available())
+                return;
+        if (geteuid() != 0)
+                return;
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                /* Use the cleanup attribute, like the other tests in this file do, so that the set is never
+                 * left dangling should this block ever be left by other means than _exit(). */
+                _cleanup_set_free_ Set *s = NULL;
+                int fd;
+
+                /* Without any filter all three families work. */
+                fd = socket(AF_INET, SOCK_DGRAM, 0);
+                assert_se(fd >= 0);
+                safe_close(fd);
+
+                fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+                assert_se(fd >= 0);
+                safe_close(fd);
+
+                fd = socket(AF_NETLINK, SOCK_DGRAM, 0);
+                assert_se(fd >= 0);
+                safe_close(fd);
+
+                /* Blacklist AF_UNIX: only that family may fail now. */
+                assert_se(s = set_new(NULL));
+                assert_se(set_put(s, INT_TO_PTR(AF_UNIX)) >= 0);
+
+                assert_se(seccomp_restrict_address_families(s, false) >= 0);
+
+                fd = socket(AF_INET, SOCK_DGRAM, 0);
+                assert_se(fd >= 0);
+                safe_close(fd);
+
+                assert_se(socket(AF_UNIX, SOCK_DGRAM, 0) < 0);
+                assert_se(errno == EAFNOSUPPORT);
+
+                fd = socket(AF_NETLINK, SOCK_DGRAM, 0);
+                assert_se(fd >= 0);
+                safe_close(fd);
+
+                set_clear(s);
+
+                /* Whitelist AF_INET on top of the filter installed above: everything else must fail. */
+                assert_se(set_put(s, INT_TO_PTR(AF_INET)) >= 0);
+
+                assert_se(seccomp_restrict_address_families(s, true) >= 0);
+
+                fd = socket(AF_INET, SOCK_DGRAM, 0);
+                assert_se(fd >= 0);
+                safe_close(fd);
+
+                assert_se(socket(AF_UNIX, SOCK_DGRAM, 0) < 0);
+                assert_se(errno == EAFNOSUPPORT);
+
+                assert_se(socket(AF_NETLINK, SOCK_DGRAM, 0) < 0);
+                assert_se(errno == EAFNOSUPPORT);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        assert_se(wait_for_terminate_and_warn("socketseccomp", pid, true) == EXIT_SUCCESS);
+}
+
+static void test_restrict_realtime(void) {
+ pid_t pid;
+
+ /* Checks in a forked child that after seccomp_restrict_realtime() the SCHED_FIFO and SCHED_RR policies
+ * are refused with EPERM, while SCHED_IDLE, SCHED_BATCH and SCHED_OTHER can still be selected. */
+
+ if (!is_seccomp_available())
+ return;
+ if (geteuid() != 0)
+ return;
+
+ if (detect_container() > 0) /* in containers RT privs are likely missing anyway */
+ return;
+
+ pid = fork();
+ assert_se(pid >= 0);
+
+ if (pid == 0) {
+ /* Without the filter all five policies can be selected. */
+ assert_se(sched_setscheduler(0, SCHED_FIFO, &(struct sched_param) { .sched_priority = 1 }) >= 0);
+ assert_se(sched_setscheduler(0, SCHED_RR, &(struct sched_param) { .sched_priority = 1 }) >= 0);
+ assert_se(sched_setscheduler(0, SCHED_IDLE, &(struct sched_param) { .sched_priority = 0 }) >= 0);
+ assert_se(sched_setscheduler(0, SCHED_BATCH, &(struct sched_param) { .sched_priority = 0 }) >= 0);
+ assert_se(sched_setscheduler(0, SCHED_OTHER, &(struct sched_param) {}) >= 0);
+
+ assert_se(seccomp_restrict_realtime() >= 0);
+
+ /* The non-realtime policies must keep working... */
+ assert_se(sched_setscheduler(0, SCHED_IDLE, &(struct sched_param) { .sched_priority = 0 }) >= 0);
+ assert_se(sched_setscheduler(0, SCHED_BATCH, &(struct sched_param) { .sched_priority = 0 }) >= 0);
+ assert_se(sched_setscheduler(0, SCHED_OTHER, &(struct sched_param) {}) >= 0);
+
+ /* ...while the realtime ones are now refused. */
+ assert_se(sched_setscheduler(0, SCHED_FIFO, &(struct sched_param) { .sched_priority = 1 }) < 0);
+ assert_se(errno == EPERM);
+ assert_se(sched_setscheduler(0, SCHED_RR, &(struct sched_param) { .sched_priority = 1 }) < 0);
+ assert_se(errno == EPERM);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ assert_se(wait_for_terminate_and_warn("realtimeseccomp", pid, true) == EXIT_SUCCESS);
+}
+
+static void test_memory_deny_write_execute(void) {
+        pid_t pid;
+
+        /* Checks in a forked child that after seccomp_memory_deny_write_execute() a mapping that is both
+         * writable and executable is refused with EPERM, while a readable and writable one still works. */
+
+        if (!is_seccomp_available())
+                return;
+        if (geteuid() != 0)
+                return;
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                void *p;
+
+                /* Without the filter a W+X mapping is possible. */
+                p = mmap(NULL, page_size(), PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1,0);
+                assert_se(p != MAP_FAILED);
+                assert_se(munmap(p, page_size()) >= 0);
+
+                /* Verify that installing the filter worked, like all other tests in this file do. Otherwise
+                 * a failure here would only show up as a misleading mmap() assertion below. */
+                assert_se(seccomp_memory_deny_write_execute() >= 0);
+
+                p = mmap(NULL, page_size(), PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1,0);
+                assert_se(p == MAP_FAILED);
+                assert_se(errno == EPERM);
+
+                p = mmap(NULL, page_size(), PROT_WRITE|PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1,0);
+                assert_se(p != MAP_FAILED);
+                assert_se(munmap(p, page_size()) >= 0);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        assert_se(wait_for_terminate_and_warn("memoryseccomp", pid, true) == EXIT_SUCCESS);
+}
+
+static void test_restrict_archs(void) {
+ pid_t pid;
+
+ /* Checks in a forked child that seccomp_restrict_archs() succeeds both for a set and for NULL, and that
+ * access() keeps working afterwards. */
+
+ if (!is_seccomp_available())
+ return;
+ if (geteuid() != 0)
+ return;
+
+ pid = fork();
+ assert_se(pid >= 0);
+
+ if (pid == 0) {
+ _cleanup_set_free_ Set *s = NULL;
+
+ assert_se(access("/", F_OK) >= 0);
+
+ assert_se(s = set_new(NULL));
+
+ /* Architectures are stored as identifier plus one. On anything but x86-64 the set stays empty. */
+#ifdef __x86_64__
+ assert_se(set_put(s, UINT32_TO_PTR(SCMP_ARCH_X86+1)) >= 0);
+#endif
+ assert_se(seccomp_restrict_archs(s) >= 0);
+
+ assert_se(access("/", F_OK) >= 0);
+ /* A NULL set must be accepted, too. */
+ assert_se(seccomp_restrict_archs(NULL) >= 0);
+
+ assert_se(access("/", F_OK) >= 0);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ assert_se(wait_for_terminate_and_warn("archseccomp", pid, true) == EXIT_SUCCESS);
+}
+
+static void test_load_syscall_filter_set_raw(void) {
+        pid_t pid;
+
+        /* Checks in a forked child that seccomp_load_syscall_filter_set_raw() is a NOP for a NULL set with a
+         * default action of "allow", and that stacked filters make access() and poll() fail with the
+         * requested errno values. */
+
+        if (!is_seccomp_available())
+                return;
+        if (geteuid() != 0)
+                return;
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                _cleanup_set_free_ Set *s = NULL;
+
+                assert_se(access("/", F_OK) >= 0);
+                assert_se(poll(NULL, 0, 0) == 0);
+
+                assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, SCMP_ACT_KILL) >= 0);
+                assert_se(access("/", F_OK) >= 0);
+                assert_se(poll(NULL, 0, 0) == 0);
+
+                assert_se(s = set_new(NULL));
+
+                /* Some architectures (e.g. arm64) have no access() system call. NOTE(review): there the C
+                 * library is expected to implement access() via faccessat() -- confirm. Entries are stored as
+                 * system call number plus one. */
+#if defined(__NR_access) && __NR_access > 0
+                assert_se(set_put(s, UINT32_TO_PTR(__NR_access + 1)) >= 0);
+#else
+                assert_se(set_put(s, UINT32_TO_PTR(__NR_faccessat + 1)) >= 0);
+#endif
+
+                assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUCLEAN)) >= 0);
+
+                assert_se(access("/", F_OK) < 0);
+                assert_se(errno == EUCLEAN);
+
+                assert_se(poll(NULL, 0, 0) == 0);
+
+                s = set_free(s);
+
+                assert_se(s = set_new(NULL));
+
+                /* Same for poll(), which is expected to be implemented via ppoll() where it is missing. */
+#if defined(__NR_poll) && __NR_poll > 0
+                assert_se(set_put(s, UINT32_TO_PTR(__NR_poll + 1)) >= 0);
+#else
+                assert_se(set_put(s, UINT32_TO_PTR(__NR_ppoll + 1)) >= 0);
+#endif
+
+                assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUNATCH)) >= 0);
+
+                /* The first filter remains in effect, the second one is stacked on top of it. */
+                assert_se(access("/", F_OK) < 0);
+                assert_se(errno == EUCLEAN);
+
+                assert_se(poll(NULL, 0, 0) < 0);
+                assert_se(errno == EUNATCH);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        assert_se(wait_for_terminate_and_warn("syscallrawseccomp", pid, true) == EXIT_SUCCESS);
+}
+
int main(int argc, char *argv[]) {
log_set_max_level(LOG_DEBUG);
@@ -225,6 +477,12 @@ int main(int argc, char *argv[]) {
test_syscall_filter_set_find();
test_filter_sets();
test_restrict_namespace();
+ test_protect_sysctl();
+ test_restrict_address_families();
+ test_restrict_realtime();
+ test_memory_deny_write_execute();
+ test_restrict_archs();
+ test_load_syscall_filter_set_raw();
return 0;
}