kernel-ark/net/core/flow.c
James Morris 134b0fc544 IPsec: propagate security module errors up from flow_cache_lookup
When a security module is loaded (in this case, SELinux), the
security_xfrm_policy_lookup() hook can return an access-denied error
(or some other error).  We were not handling that correctly; in fact we
were inverting the return logic and propagating a false "ok" back up to
xfrm_lookup(), which then allowed packets to pass as if they were not
associated with an xfrm policy.

The way I was seeing the problem was when connecting via IPsec to a
confined service on an SELinux box (vsftpd), which did not have the
appropriate SELinux policy permissions to send packets via IPsec.

The first SYNACK would be blocked, because of an uncached lookup via
flow_cache_lookup(), which would fail to resolve an xfrm policy because
the SELinux policy is checked at that point via the resolver.

However, retransmitted SYNACKs would then find a cached flow entry when
calling into flow_cache_lookup() with a null xfrm policy, which is
interpreted by xfrm_lookup() as the packet not having any associated
policy and similarly to the first case, allowing it to pass without
transformation.

The solution presented here is to first ensure that errno values are
correctly propagated all the way back up through the various call chains
from security_xfrm_policy_lookup(), and handled correctly.

Then, flow_cache_lookup() is modified, so that if the policy resolver
fails (typically a permission denied via the security module), the flow
cache entry is killed rather than having a null policy assigned (which
indicates that the packet can pass freely).  This also forces any future
lookups for the same flow to consult the security module (e.g. SELinux)
for current security policy (rather than, say, caching the error on the
flow cache entry).

Signed-off-by: James Morris <jmorris@namei.org>
2006-10-11 23:59:34 -07:00

382 lines
8.3 KiB
C

/* flow.c: Generic flow cache.
*
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
#include <linux/security.h>
/*
 * One cached lookup result.  Entries sit on singly linked, NULL-terminated
 * chains hanging off the buckets of a per-cpu hash table.
 */
struct flow_cache_entry {
/* Next entry in the same hash bucket. */
struct flow_cache_entry *next;
/* family, dir and key together identify the flow; flow_cache_lookup()
 * requires all three to match before it treats the entry as a hit. */
u16 family;
u8 dir;
struct flowi key;
/* Value of flow_cache_genid when object was last resolved; the entry is
 * only trusted while this still equals the global counter. */
u32 genid;
/* Resolver result cached for this flow; NULL means nothing is cached. */
void *object;
/* Reference counter that goes with object.  It is only incremented or
 * decremented while object is non-NULL. */
atomic_t *object_ref;
};
/* Global generation counter (exported below).  Cached entries whose genid
 * differs from it are treated as stale.  Nothing in this file modifies it;
 * presumably users of the cache bump it to invalidate entries -- confirm
 * against the callers. */
atomic_t flow_cache_genid = ATOMIC_INIT(0);
/* log2 of the number of buckets in each per-cpu table; assigned in
 * flow_cache_init(). */
static u32 flow_hash_shift;
#define flow_hash_size (1 << flow_hash_shift)
/* Per-cpu bucket array.  Stays NULL until flow_cache_cpu_prepare() has run
 * for that cpu. */
static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
#define flow_table(cpu) (per_cpu(flow_tables, cpu))
/* Slab cache from which every flow_cache_entry is allocated. */
static kmem_cache_t *flow_cachep __read_mostly;
/* Low and high water marks.  When a cpu's entry count exceeds flow_hwm,
 * flow_cache_lookup() shrinks each bucket to flow_lwm / flow_hash_size
 * entries before inserting a new one. */
static int flow_lwm, flow_hwm;
/* Per-cpu bookkeeping for the flow cache. */
struct flow_percpu_info {
/* Non-zero when this cpu must pick a new hash seed on its next lookup. */
int hash_rnd_recalc;
/* Seed passed to jhash2() when hashing keys on this cpu. */
u32 hash_rnd;
/* Number of entries currently allocated for this cpu. */
int count;
} ____cacheline_aligned;
static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
/* Lvalue accessors for the fields of a given cpu's flow_percpu_info. */
#define flow_hash_rnd_recalc(cpu) \
(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
#define flow_hash_rnd(cpu) \
(per_cpu(flow_hash_info, cpu).hash_rnd)
#define flow_count(cpu) \
(per_cpu(flow_hash_info, cpu).count)
/* Timer that periodically requests a new hash seed on every cpu; see
 * flow_cache_new_hashrnd(). */
static struct timer_list flow_hash_rnd_timer;
/* Interval between seed changes: 10 * 60 * HZ jiffies. */
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
/* Shared state for one flow_cache_flush() run. */
struct flow_flush_info {
/* Number of cpus that still have to finish their part of the flush. */
atomic_t cpuleft;
/* Completed by the cpu that brings cpuleft down to zero. */
struct completion completion;
};
/* One tasklet per cpu; each runs flow_cache_flush_tasklet() on its cpu. */
static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
static void flow_cache_new_hashrnd(unsigned long arg)
{
int i;
for_each_possible_cpu(i)
flow_hash_rnd_recalc(i) = 1;
flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&flow_hash_rnd_timer);
}
static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
{
if (fle->object)
atomic_dec(fle->object_ref);
kmem_cache_free(flow_cachep, fle);
flow_count(cpu)--;
}
/*
 * Trim every hash bucket of @cpu down to at most @shrink_to entries.
 *
 * The first @shrink_to entries of each chain are kept; everything behind
 * them is unlinked and freed.  A @shrink_to of zero empties the table.
 */
static void __flow_cache_shrink(int cpu, int shrink_to)
{
	int bucket;

	for (bucket = 0; bucket < flow_hash_size; bucket++) {
		struct flow_cache_entry **link = &flow_table(cpu)[bucket];
		struct flow_cache_entry *victim;
		int kept;

		/* Step over the entries that are allowed to stay. */
		for (kept = 0; kept < shrink_to && *link != NULL; kept++)
			link = &(*link)->next;

		/* Unlink and free whatever follows them. */
		for (victim = *link; victim != NULL; victim = *link) {
			*link = victim->next;
			flow_entry_kill(cpu, victim);
		}
	}
}
/*
 * Shrink @cpu's table so that each bucket holds no more than its share
 * of the low water mark (flow_lwm spread evenly over all buckets).
 */
static void flow_cache_shrink(int cpu)
{
	__flow_cache_shrink(cpu, flow_lwm / flow_hash_size);
}
static void flow_new_hash_rnd(int cpu)
{
get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
flow_hash_rnd_recalc(cpu) = 0;
__flow_cache_shrink(cpu, 0);
}
static u32 flow_hash_code(struct flowi *key, int cpu)
{
u32 *k = (u32 *) key;
return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
(flow_hash_size - 1));
}
/* Word type used by flow_key_compare(): 64-bit when BITS_PER_LONG is 64,
 * otherwise 32-bit. */
#if (BITS_PER_LONG == 64)
typedef u64 flow_compare_t;
#else
typedef u32 flow_compare_t;
#endif
/* Declared here but not defined in this file.  flow_key_compare() only
 * calls it when sizeof(struct flowi) is not a multiple of
 * sizeof(flow_compare_t); presumably it is left undefined on purpose so
 * that a mis-sized struct flowi fails at link time -- confirm. */
extern void flowi_is_missized(void);
/* I hear what you're saying, use memcmp. But memcmp cannot make
* important assumptions that we can here, such as alignment and
* constant size.
*/
static int flow_key_compare(struct flowi *key1, struct flowi *key2)
{
flow_compare_t *k1, *k1_lim, *k2;
const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
if (sizeof(struct flowi) % sizeof(flow_compare_t))
flowi_is_missized();
k1 = (flow_compare_t *) key1;
k1_lim = k1 + n_elem;
k2 = (flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
return 1;
} while (k1 < k1_lim);
return 0;
}
/*
 * flow_cache_lookup - look up (and, on a miss, resolve and cache) the
 * object associated with a flow.
 *
 * @key:      flow key to look up
 * @family:   address family; part of the cache match
 * @dir:      direction; part of the cache match
 * @resolver: callback that computes the object on a cache miss.  It is
 *            called as resolver(key, family, dir, &obj, &obj_ref) and
 *            returns 0 on success or a non-zero error code.
 *
 * Runs with bottom halves disabled and only touches the current cpu's
 * table.
 *
 * On a hit with a current generation id the cached object is returned
 * (it may be NULL) after incrementing its reference counter.  Otherwise
 * the resolver is consulted:
 *   - on success the result is stored in the cache entry (if one could be
 *     found or allocated), the entry takes its own reference on a
 *     non-NULL object, and the object is returned;
 *   - on failure the cache entry is removed, so that the next lookup asks
 *     the resolver again, and ERR_PTR(err) is returned.
 */
void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
			flow_resolve_t resolver)
{
	struct flow_cache_entry *fle, **head;
	unsigned int hash;
	int cpu;

	local_bh_disable();
	cpu = smp_processor_id();

	fle = NULL;
	head = NULL;
	/* Packet really early in init?  Making flow_cache_init a
	 * pre-smp initcall would solve this.  --RR */
	if (!flow_table(cpu))
		goto nocache;

	if (flow_hash_rnd_recalc(cpu))
		flow_new_hash_rnd(cpu);
	hash = flow_hash_code(key, cpu);

	head = &flow_table(cpu)[hash];
	for (fle = *head; fle; fle = fle->next) {
		if (fle->family == family &&
		    fle->dir == dir &&
		    flow_key_compare(key, &fle->key) == 0) {
			if (fle->genid == atomic_read(&flow_cache_genid)) {
				void *ret = fle->object;

				if (ret)
					atomic_inc(fle->object_ref);
				local_bh_enable();

				return ret;
			}
			/* Matching but stale entry: re-resolve it below. */
			break;
		}
	}

	if (!fle) {
		/* No entry for this flow yet: make room if needed and
		 * insert a fresh one at the head of the bucket. */
		if (flow_count(cpu) > flow_hwm)
			flow_cache_shrink(cpu);

		fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
		if (fle) {
			fle->next = *head;
			*head = fle;
			fle->family = family;
			fle->dir = dir;
			memcpy(&fle->key, key, sizeof(*key));
			fle->object = NULL;
			flow_count(cpu)++;
		}
	}

nocache:
	{
		int err;
		void *obj;
		atomic_t *obj_ref;

		err = resolver(key, family, dir, &obj, &obj_ref);

		if (fle) {
			if (err) {
				struct flow_cache_entry **flp;

				/* Force security policy check on next lookup.
				 *
				 * A stale entry that is being re-resolved may
				 * sit anywhere in the chain, so unlink it via
				 * the link that actually points at it.
				 * Rewriting *head would detach, and leak,
				 * every entry in front of it.
				 */
				for (flp = head; *flp; flp = &(*flp)->next) {
					if (*flp == fle) {
						*flp = fle->next;
						break;
					}
				}
				flow_entry_kill(cpu, fle);
			} else {
				fle->genid = atomic_read(&flow_cache_genid);

				/* Drop the reference on whatever was cached
				 * before, then take one on the new object. */
				if (fle->object)
					atomic_dec(fle->object_ref);

				fle->object = obj;
				fle->object_ref = obj_ref;
				if (obj)
					atomic_inc(fle->object_ref);
			}
		}
		local_bh_enable();

		if (err)
			obj = ERR_PTR(err);
		return obj;
	}
}
/*
 * Per-cpu half of a cache flush.
 *
 * Walks every bucket of the current cpu's table and, for each entry whose
 * generation id no longer matches flow_cache_genid, forgets the cached
 * object and drops the reference held on it.  The entries themselves stay
 * in the table.  Finally this cpu is reported as done; the last cpu to
 * finish completes the waiter.
 *
 * @data: the struct flow_flush_info of the flush in progress, cast to
 *        unsigned long.
 */
static void flow_cache_flush_tasklet(unsigned long data)
{
	struct flow_flush_info *info = (void *)data;
	int cpu = smp_processor_id();
	int bucket;

	for (bucket = 0; bucket < flow_hash_size; bucket++) {
		struct flow_cache_entry *entry = flow_table(cpu)[bucket];

		while (entry != NULL) {
			unsigned live_genid = atomic_read(&flow_cache_genid);

			if (entry->object != NULL &&
			    entry->genid != live_genid) {
				entry->object = NULL;
				atomic_dec(entry->object_ref);
			}
			entry = entry->next;
		}
	}

	if (atomic_dec_and_test(&info->cpuleft))
		complete(&info->completion);
}
static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
/*
 * Cross-cpu callback used by flow_cache_flush(): hand the flush
 * descriptor in @data to the current cpu's flush tasklet and schedule it.
 */
static void flow_cache_flush_per_cpu(void *data)
{
	struct tasklet_struct *t = flow_flush_tasklet(smp_processor_id());

	t->data = (unsigned long)data;
	tasklet_schedule(t);
}
/*
 * Make every online cpu drop its stale cached objects, and wait until
 * all of them have done so.
 *
 * Sleeps (mutex and completion), so it must be called from a context
 * that is allowed to block.
 */
void flow_cache_flush(void)
{
/* Lives on this stack frame; the wait_for_completion() below does not
 * return until the completion has been signalled. */
struct flow_flush_info info;
/* Allows only one flush to run at a time. */
static DEFINE_MUTEX(flow_flush_sem);
/* Don't want cpus going down or up during this. */
lock_cpu_hotplug();
mutex_lock(&flow_flush_sem);
/* One unit per online cpu, this one included; each cpu's flush pass
 * decrements the counter once. */
atomic_set(&info.cpuleft, num_online_cpus());
init_completion(&info.completion);
local_bh_disable();
/* NOTE(review): relies on smp_call_function() running the handler on
 * every other online cpu, where it schedules that cpu's flush tasklet --
 * confirm against the smp_call_function() contract. */
smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0);
/* Do this cpu's share directly, with bottom halves disabled. */
flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
mutex_unlock(&flow_flush_sem);
unlock_cpu_hotplug();
}
/*
 * Set up the flow cache state of one cpu.
 *
 * Allocates a zeroed bucket array big enough for flow_hash_size chain
 * heads (rounded up to a power-of-two number of pages), panics if that
 * fails, requests a hash seed on first use, resets the entry count and
 * initialises the cpu's flush tasklet.
 */
static void __devinit flow_cache_cpu_prepare(int cpu)
{
	const unsigned long table_bytes =
		sizeof(struct flow_cache_entry *) * flow_hash_size;
	unsigned long order = 0;

	/* Smallest page order that can hold the whole bucket array. */
	while ((PAGE_SIZE << order) < table_bytes)
		order++;

	flow_table(cpu) = (struct flow_cache_entry **)
		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
	if (flow_table(cpu) == NULL)
		panic("NET: failed to allocate flow cache order %lu\n", order);

	flow_hash_rnd_recalc(cpu) = 1;
	flow_count(cpu) = 0;

	tasklet_init(flow_flush_tasklet(cpu), flow_cache_flush_tasklet, 0);
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier: when a cpu has died, free every entry in its
 * flow table.  All other events are ignored.  @hcpu carries the cpu
 * number.  Always returns NOTIFY_OK.
 */
static int flow_cache_cpu(struct notifier_block *nfb,
			  unsigned long action,
			  void *hcpu)
{
	switch (action) {
	case CPU_DEAD:
		__flow_cache_shrink((unsigned long)hcpu, 0);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * Boot-time initialisation of the flow cache.
 *
 * Fixes the table geometry (1 << 10 buckets per cpu, low/high water marks
 * at 2x and 4x the bucket count), creates the entry slab cache, starts
 * the periodic re-seeding timer, prepares the per-cpu state of every
 * possible cpu and registers the cpu hotplug notifier.  Always returns 0.
 */
static int __init flow_cache_init(void)
{
	int cpu;

	flow_hash_shift = 10;
	flow_lwm = 2 * flow_hash_size;
	flow_hwm = 4 * flow_hash_size;

	flow_cachep = kmem_cache_create("flow_cache",
					sizeof(struct flow_cache_entry),
					0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					NULL, NULL);

	init_timer(&flow_hash_rnd_timer);
	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	flow_hash_rnd_timer.function = flow_cache_new_hashrnd;
	add_timer(&flow_hash_rnd_timer);

	for_each_possible_cpu(cpu) {
		flow_cache_cpu_prepare(cpu);
	}

	hotcpu_notifier(flow_cache_cpu, 0);
	return 0;
}
/* Register flow_cache_init() as this file's initialisation hook. */
module_init(flow_cache_init);
/* Symbols exported for use outside this file. */
EXPORT_SYMBOL(flow_cache_genid);
EXPORT_SYMBOL(flow_cache_lookup);