kernel-ark/net/core/flow.c

368 lines
8.0 KiB
C
Raw Normal View History

/* flow.c: Generic flow cache.
*
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
[LSM-IPSec]: Security association restriction. This patch series implements per packet access control via the extension of the Linux Security Modules (LSM) interface by hooks in the XFRM and pfkey subsystems that leverage IPSec security associations to label packets. Extensions to the SELinux LSM are included that leverage the patch for this purpose. This patch implements the changes necessary to the XFRM subsystem, pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a socket to use only authorized security associations (or no security association) to send/receive network packets. Patch purpose: The patch is designed to enable access control per packets based on the strongly authenticated IPSec security association. Such access controls augment the existing ones based on network interface and IP address. The former are very coarse-grained, and the latter can be spoofed. By using IPSec, the system can control access to remote hosts based on cryptographic keys generated using the IPSec mechanism. This enables access control on a per-machine basis or per-application if the remote machine is running the same mechanism and trusted to enforce the access control policy. Patch design approach: The overall approach is that policy (xfrm_policy) entries set by user-level programs (e.g., setkey for ipsec-tools) are extended with a security context that is used at policy selection time in the XFRM subsystem to restrict the sockets that can send/receive packets via security associations (xfrm_states) that are built from those policies. A presentation available at www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf from the SELinux symposium describes the overall approach. Patch implementation details: On output, the policy retrieved (via xfrm_policy_lookup or xfrm_sk_policy_lookup) must be authorized for the security context of the socket and the same security context is required for resultant security association (retrieved or negotiated via racoon in ipsec-tools). This is enforced in xfrm_state_find. On input, the policy retrieved must also be authorized for the socket (at __xfrm_policy_check), and the security context of the policy must also match the security association being used. The patch has virtually no impact on packets that do not use IPSec. The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as before. Also, if IPSec is used without security contexts, the impact is minimal. The LSM must allow such policies to be selected for the combination of socket and remote machine, but subsequent IPSec processing proceeds as in the original case. Testing: The pfkey interface is tested using the ipsec-tools. ipsec-tools have been modified (a separate ipsec-tools patch is available for version 0.5) that supports assignment of xfrm_policy entries and security associations with security contexts via setkey and the negotiation using the security contexts via racoon. The xfrm_user interface is tested via ad hoc programs that set security contexts. These programs are also available from me, and contain programs for setting, getting, and deleting policy for testing this interface. Testing of sa functions was done by tracing kernel behavior. Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-12-14 07:12:27 +00:00
#include <linux/security.h>
struct flow_cache_entry {
struct flow_cache_entry *next;
u16 family;
u8 dir;
struct flowi key;
u32 genid;
void *object;
atomic_t *object_ref;
};
atomic_t flow_cache_genid = ATOMIC_INIT(0);
static u32 flow_hash_shift;
#define flow_hash_size (1 << flow_hash_shift)
static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
#define flow_table(cpu) (per_cpu(flow_tables, cpu))
static kmem_cache_t *flow_cachep __read_mostly;
static int flow_lwm, flow_hwm;
struct flow_percpu_info {
int hash_rnd_recalc;
u32 hash_rnd;
int count;
} ____cacheline_aligned;
static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
#define flow_hash_rnd_recalc(cpu) \
(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
#define flow_hash_rnd(cpu) \
(per_cpu(flow_hash_info, cpu).hash_rnd)
#define flow_count(cpu) \
(per_cpu(flow_hash_info, cpu).count)
static struct timer_list flow_hash_rnd_timer;
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
struct flow_flush_info {
atomic_t cpuleft;
struct completion completion;
};
static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
static void flow_cache_new_hashrnd(unsigned long arg)
{
int i;
for_each_possible_cpu(i)
flow_hash_rnd_recalc(i) = 1;
flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&flow_hash_rnd_timer);
}
static void __flow_cache_shrink(int cpu, int shrink_to)
{
struct flow_cache_entry *fle, **flp;
int i;
for (i = 0; i < flow_hash_size; i++) {
int k = 0;
flp = &flow_table(cpu)[i];
while ((fle = *flp) != NULL && k < shrink_to) {
k++;
flp = &fle->next;
}
while ((fle = *flp) != NULL) {
*flp = fle->next;
if (fle->object)
atomic_dec(fle->object_ref);
kmem_cache_free(flow_cachep, fle);
flow_count(cpu)--;
}
}
}
static void flow_cache_shrink(int cpu)
{
int shrink_to = flow_lwm / flow_hash_size;
__flow_cache_shrink(cpu, shrink_to);
}
static void flow_new_hash_rnd(int cpu)
{
get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
flow_hash_rnd_recalc(cpu) = 0;
__flow_cache_shrink(cpu, 0);
}
static u32 flow_hash_code(struct flowi *key, int cpu)
{
u32 *k = (u32 *) key;
return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
(flow_hash_size - 1));
}
#if (BITS_PER_LONG == 64)
typedef u64 flow_compare_t;
#else
typedef u32 flow_compare_t;
#endif
extern void flowi_is_missized(void);
/* I hear what you're saying, use memcmp. But memcmp cannot make
* important assumptions that we can here, such as alignment and
* constant size.
*/
static int flow_key_compare(struct flowi *key1, struct flowi *key2)
{
flow_compare_t *k1, *k1_lim, *k2;
const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
if (sizeof(struct flowi) % sizeof(flow_compare_t))
flowi_is_missized();
k1 = (flow_compare_t *) key1;
k1_lim = k1 + n_elem;
k2 = (flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
return 1;
} while (k1 < k1_lim);
return 0;
}
void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver)
{
struct flow_cache_entry *fle, **head;
unsigned int hash;
int cpu;
local_bh_disable();
cpu = smp_processor_id();
fle = NULL;
/* Packet really early in init? Making flow_cache_init a
* pre-smp initcall would solve this. --RR */
if (!flow_table(cpu))
goto nocache;
if (flow_hash_rnd_recalc(cpu))
flow_new_hash_rnd(cpu);
hash = flow_hash_code(key, cpu);
head = &flow_table(cpu)[hash];
for (fle = *head; fle; fle = fle->next) {
if (fle->family == family &&
fle->dir == dir &&
flow_key_compare(key, &fle->key) == 0) {
if (fle->genid == atomic_read(&flow_cache_genid)) {
void *ret = fle->object;
if (ret)
atomic_inc(fle->object_ref);
local_bh_enable();
return ret;
}
break;
}
}
if (!fle) {
if (flow_count(cpu) > flow_hwm)
flow_cache_shrink(cpu);
fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
if (fle) {
fle->next = *head;
*head = fle;
fle->family = family;
fle->dir = dir;
memcpy(&fle->key, key, sizeof(*key));
fle->object = NULL;
flow_count(cpu)++;
}
}
nocache:
{
void *obj;
atomic_t *obj_ref;
resolver(key, family, dir, &obj, &obj_ref);
if (fle) {
fle->genid = atomic_read(&flow_cache_genid);
if (fle->object)
atomic_dec(fle->object_ref);
fle->object = obj;
fle->object_ref = obj_ref;
if (obj)
atomic_inc(fle->object_ref);
}
local_bh_enable();
return obj;
}
}
static void flow_cache_flush_tasklet(unsigned long data)
{
struct flow_flush_info *info = (void *)data;
int i;
int cpu;
cpu = smp_processor_id();
for (i = 0; i < flow_hash_size; i++) {
struct flow_cache_entry *fle;
fle = flow_table(cpu)[i];
for (; fle; fle = fle->next) {
unsigned genid = atomic_read(&flow_cache_genid);
if (!fle->object || fle->genid == genid)
continue;
fle->object = NULL;
atomic_dec(fle->object_ref);
}
}
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
}
static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
static void flow_cache_flush_per_cpu(void *data)
{
struct flow_flush_info *info = data;
int cpu;
struct tasklet_struct *tasklet;
cpu = smp_processor_id();
tasklet = flow_flush_tasklet(cpu);
tasklet->data = (unsigned long)info;
tasklet_schedule(tasklet);
}
void flow_cache_flush(void)
{
struct flow_flush_info info;
static DEFINE_MUTEX(flow_flush_sem);
/* Don't want cpus going down or up during this. */
lock_cpu_hotplug();
mutex_lock(&flow_flush_sem);
atomic_set(&info.cpuleft, num_online_cpus());
init_completion(&info.completion);
local_bh_disable();
smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0);
flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
mutex_unlock(&flow_flush_sem);
unlock_cpu_hotplug();
}
static void __devinit flow_cache_cpu_prepare(int cpu)
{
struct tasklet_struct *tasklet;
unsigned long order;
for (order = 0;
(PAGE_SIZE << order) <
(sizeof(struct flow_cache_entry *)*flow_hash_size);
order++)
/* NOTHING */;
flow_table(cpu) = (struct flow_cache_entry **)
__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
if (!flow_table(cpu))
panic("NET: failed to allocate flow cache order %lu\n", order);
flow_hash_rnd_recalc(cpu) = 1;
flow_count(cpu) = 0;
tasklet = flow_flush_tasklet(cpu);
tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
}
#ifdef CONFIG_HOTPLUG_CPU
static int flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
if (action == CPU_DEAD)
__flow_cache_shrink((unsigned long)hcpu, 0);
return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
static int __init flow_cache_init(void)
{
int i;
flow_cachep = kmem_cache_create("flow_cache",
sizeof(struct flow_cache_entry),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL, NULL);
flow_hash_shift = 10;
flow_lwm = 2 * flow_hash_size;
flow_hwm = 4 * flow_hash_size;
init_timer(&flow_hash_rnd_timer);
flow_hash_rnd_timer.function = flow_cache_new_hashrnd;
flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&flow_hash_rnd_timer);
for_each_possible_cpu(i)
flow_cache_cpu_prepare(i);
hotcpu_notifier(flow_cache_cpu, 0);
return 0;
}
module_init(flow_cache_init);
EXPORT_SYMBOL(flow_cache_genid);
EXPORT_SYMBOL(flow_cache_lookup);