93bb0ceb75
nf_conntrack_lock is a monolithic lock and suffers from huge contention on current generation servers (8 or more cores/threads). Perf locking congestion is clear on base kernel: - 72.56% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock_bh - _raw_spin_lock_bh + 25.33% init_conntrack + 24.86% nf_ct_delete_from_lists + 24.62% __nf_conntrack_confirm + 24.38% destroy_conntrack + 0.70% tcp_packet + 2.21% ksoftirqd/6 [kernel.kallsyms] [k] fib_table_lookup + 1.15% ksoftirqd/6 [kernel.kallsyms] [k] __slab_free + 0.77% ksoftirqd/6 [kernel.kallsyms] [k] inet_getpeer + 0.70% ksoftirqd/6 [nf_conntrack] [k] nf_ct_delete + 0.55% ksoftirqd/6 [ip_tables] [k] ipt_do_table This patch changes conntrack locking and provides a huge performance improvement. SYN-flood attack tested on a 24-core E5-2695v2(ES) with 10Gbit/s ixgbe (with tool trafgen): Base kernel: 810.405 new conntrack/sec After patch: 2.233.876 new conntrack/sec Notice other flood attacks (SYN+ACK or ACK) can easily be deflected using: # iptables -A INPUT -m state --state INVALID -j DROP # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0 Use an array of hashed spinlocks to protect insertions/deletions of conntracks into the hash table. 1024 spinlocks seem to give good results, at minimal cost (4KB memory). Due to lockdep max depth, 1024 becomes 8 if CONFIG_LOCKDEP=y The hash resize is a bit tricky, because we need to take all locks in the array. A seqcount_t is used to synchronize the hash table users with the resizing process. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Reviewed-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
113 lines
2.6 KiB
C
113 lines
2.6 KiB
C
#ifndef __NETNS_CONNTRACK_H
|
|
#define __NETNS_CONNTRACK_H
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/list_nulls.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/netfilter/nf_conntrack_tcp.h>
|
|
#include <linux/seqlock.h>
|
|
|
|
struct ctl_table_header;
|
|
struct nf_conntrack_ecache;
|
|
|
|
/* Common per-netns state shared by every conntrack L4 protocol tracker:
 * optional sysctl registration handles plus a count of users of this
 * tracker in the namespace.
 */
struct nf_proto_net {
#ifdef CONFIG_SYSCTL
	struct ctl_table_header	*ctl_table_header;
	struct ctl_table	*ctl_table;
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
	/* legacy /proc/net/ipv4/netfilter compatibility tables */
	struct ctl_table_header	*ctl_compat_header;
	struct ctl_table	*ctl_compat_table;
#endif
#endif
	unsigned int		users;	/* number of users of this tracker in this netns */
};
|
|
|
|
struct nf_generic_net {
|
|
struct nf_proto_net pn;
|
|
unsigned int timeout;
|
|
};
|
|
|
|
struct nf_tcp_net {
|
|
struct nf_proto_net pn;
|
|
unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX];
|
|
unsigned int tcp_loose;
|
|
unsigned int tcp_be_liberal;
|
|
unsigned int tcp_max_retrans;
|
|
};
|
|
|
|
/* Index into the per-netns UDP timeout array (struct nf_udp_net). */
enum udp_conntrack {
	UDP_CT_UNREPLIED,	/* no reply traffic seen yet */
	UDP_CT_REPLIED,		/* reply traffic has been seen */
	UDP_CT_MAX		/* array size, not a real state */
};
|
|
|
|
struct nf_udp_net {
|
|
struct nf_proto_net pn;
|
|
unsigned int timeouts[UDP_CT_MAX];
|
|
};
|
|
|
|
struct nf_icmp_net {
|
|
struct nf_proto_net pn;
|
|
unsigned int timeout;
|
|
};
|
|
|
|
struct nf_ip_net {
|
|
struct nf_generic_net generic;
|
|
struct nf_tcp_net tcp;
|
|
struct nf_udp_net udp;
|
|
struct nf_icmp_net icmp;
|
|
struct nf_icmp_net icmpv6;
|
|
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
|
|
struct ctl_table_header *ctl_table_header;
|
|
struct ctl_table *ctl_table;
|
|
#endif
|
|
};
|
|
|
|
struct ct_pcpu {
|
|
spinlock_t lock;
|
|
struct hlist_nulls_head unconfirmed;
|
|
struct hlist_nulls_head dying;
|
|
struct hlist_nulls_head tmpl;
|
|
};
|
|
|
|
struct netns_ct {
|
|
atomic_t count;
|
|
unsigned int expect_count;
|
|
#ifdef CONFIG_SYSCTL
|
|
struct ctl_table_header *sysctl_header;
|
|
struct ctl_table_header *acct_sysctl_header;
|
|
struct ctl_table_header *tstamp_sysctl_header;
|
|
struct ctl_table_header *event_sysctl_header;
|
|
struct ctl_table_header *helper_sysctl_header;
|
|
#endif
|
|
char *slabname;
|
|
unsigned int sysctl_log_invalid; /* Log invalid packets */
|
|
unsigned int sysctl_events_retry_timeout;
|
|
int sysctl_events;
|
|
int sysctl_acct;
|
|
int sysctl_auto_assign_helper;
|
|
bool auto_assign_helper_warned;
|
|
int sysctl_tstamp;
|
|
int sysctl_checksum;
|
|
|
|
unsigned int htable_size;
|
|
seqcount_t generation;
|
|
struct kmem_cache *nf_conntrack_cachep;
|
|
struct hlist_nulls_head *hash;
|
|
struct hlist_head *expect_hash;
|
|
struct ct_pcpu __percpu *pcpu_lists;
|
|
struct ip_conntrack_stat __percpu *stat;
|
|
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
|
|
struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
|
|
struct nf_ip_net nf_ct_proto;
|
|
#if defined(CONFIG_NF_CONNTRACK_LABELS)
|
|
unsigned int labels_used;
|
|
u8 label_words;
|
|
#endif
|
|
#ifdef CONFIG_NF_NAT_NEEDED
|
|
struct hlist_head *nat_bysource;
|
|
unsigned int nat_htable_size;
|
|
#endif
|
|
};
|
|
#endif
|