93bb0ceb75
nf_conntrack_lock is a monolithic lock and suffers from huge contention on current generation servers (8 or more core/threads). Perf locking congestion is clear on base kernel: - 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh - _raw_spin_lock_bh + 25.33% init_conntrack + 24.86% nf_ct_delete_from_lists + 24.62% __nf_conntrack_confirm + 24.38% destroy_conntrack + 0.70% tcp_packet + 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup + 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free + 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer + 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete + 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table This patch changes conntrack locking and provides a huge performance improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with 10Gbit/s ixgbe (with tool trafgen): Base kernel: 810.405 new conntrack/sec After patch: 2.233.876 new conntrack/sec Notice other flood attacks (SYN+ACK or ACK) can easily be deflected using: # iptables -A INPUT -m state --state INVALID -j DROP # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0 Use an array of hashed spinlocks to protect insertions/deletions of conntracks into the hash table. 1024 spinlocks seem to give good results, at minimal cost (4KB memory). Due to lockdep max depth, 1024 becomes 8 if CONFIG_LOCKDEP=y The hash resize is a bit tricky, because we need to take all locks in the array. A seqcount_t is used to synchronize the hash table users with the resizing process. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Reviewed-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
90 lines
2.8 KiB
C
90 lines
2.8 KiB
C
/*
|
|
* This header is used to share core functionality between the
|
|
* standalone connection tracking module, and the compatibility layer's use
|
|
* of connection tracking.
|
|
*
|
|
* 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
|
|
* - generalize L3 protocol dependent part.
|
|
*
|
|
 * Derived from include/linux/netfilter_ipv4/ip_conntrack_core.h
|
|
*/
|
|
|
|
#ifndef _NF_CONNTRACK_CORE_H
|
|
#define _NF_CONNTRACK_CORE_H
|
|
|
|
#include <linux/netfilter.h>
|
|
#include <net/netfilter/nf_conntrack_l3proto.h>
|
|
#include <net/netfilter/nf_conntrack_l4proto.h>
|
|
#include <net/netfilter/nf_conntrack_ecache.h>
|
|
|
|
/* This header is used to share core functionality between the
|
|
standalone connection tracking module, and the compatibility layer's use
|
|
of connection tracking. */
|
|
unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
|
|
struct sk_buff *skb);
|
|
|
|
int nf_conntrack_init_net(struct net *net);
|
|
void nf_conntrack_cleanup_net(struct net *net);
|
|
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
|
|
|
|
int nf_conntrack_proto_pernet_init(struct net *net);
|
|
void nf_conntrack_proto_pernet_fini(struct net *net);
|
|
|
|
int nf_conntrack_proto_init(void);
|
|
void nf_conntrack_proto_fini(void);
|
|
|
|
int nf_conntrack_init_start(void);
|
|
void nf_conntrack_cleanup_start(void);
|
|
|
|
void nf_conntrack_init_end(void);
|
|
void nf_conntrack_cleanup_end(void);
|
|
|
|
bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff,
|
|
unsigned int dataoff, u_int16_t l3num, u_int8_t protonum,
|
|
struct nf_conntrack_tuple *tuple,
|
|
const struct nf_conntrack_l3proto *l3proto,
|
|
const struct nf_conntrack_l4proto *l4proto);
|
|
|
|
bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
|
|
const struct nf_conntrack_tuple *orig,
|
|
const struct nf_conntrack_l3proto *l3proto,
|
|
const struct nf_conntrack_l4proto *l4proto);
|
|
|
|
/* Find a connection corresponding to a tuple. */
|
|
struct nf_conntrack_tuple_hash *
|
|
nf_conntrack_find_get(struct net *net, u16 zone,
|
|
const struct nf_conntrack_tuple *tuple);
|
|
|
|
int __nf_conntrack_confirm(struct sk_buff *skb);
|
|
|
|
/* Confirm a connection: returns NF_DROP if packet must be dropped. */
|
|
static inline int nf_conntrack_confirm(struct sk_buff *skb)
|
|
{
|
|
struct nf_conn *ct = (struct nf_conn *)skb->nfct;
|
|
int ret = NF_ACCEPT;
|
|
|
|
if (ct && !nf_ct_is_untracked(ct)) {
|
|
if (!nf_ct_is_confirmed(ct))
|
|
ret = __nf_conntrack_confirm(skb);
|
|
if (likely(ret == NF_ACCEPT))
|
|
nf_ct_deliver_cached_events(ct);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
int
|
|
print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
|
|
const struct nf_conntrack_l3proto *l3proto,
|
|
const struct nf_conntrack_l4proto *proto);
|
|
|
|
#ifdef CONFIG_LOCKDEP
|
|
# define CONNTRACK_LOCKS 8
|
|
#else
|
|
# define CONNTRACK_LOCKS 1024
|
|
#endif
|
|
extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
|
|
|
|
extern spinlock_t nf_conntrack_expect_lock;
|
|
|
|
#endif /* _NF_CONNTRACK_CORE_H */
|