kernel-ark/net/ipv6/sysctl_net_ipv6.c
Tom Herbert 82a584b7cd ipv6: Flow label state ranges
This patch divides the IPv6 flow label space into two ranges:
0-7ffff is reserved for flow label manager, 80000-fffff will be
used for creating auto flow labels (per RFC6438). This only affects how
labels are set on transmit, it does not affect receive. This range split
can be disbaled by systcl.

Background:

IPv6 flow labels have been an unmitigated disappointment thus far
in the lifetime of IPv6. Support in HW devices to use them for ECMP
is lacking, and OSes don't turn them on by default. If we had these
we could get much better hashing in IPv6 networks without resorting
to DPI, possibly eliminating some of the motivations to to define new
encaps in UDP just for getting ECMP.

Unfortunately, the initial specfications of IPv6 did not clarify
how they are to be used. There has always been a vague concept that
these can be used for ECMP, flow hashing, etc. and we do now have a
good standard how to this in RFC6438. The problem is that flow labels
can be either stateful or stateless (as in RFC6438), and we are
presented with the possibility that a stateless label may collide
with a stateful one.  Attempts to split the flow label space were
rejected in IETF. When we added support in Linux for RFC6438, we
could not turn on flow labels by default due to this conflict.

This patch splits the flow label space and should give us
a path to enabling auto flow labels by default for all IPv6 packets.
This is an API change so we need to consider compatibility with
existing deployment. The stateful range is chosen to be the lower
values in hopes that most uses would have chosen small numbers.

Once we resolve the stateless/stateful issue, we can proceed to
look at enabling RFC6438 flow labels by default (starting with
scaled testing).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-03 21:58:01 -04:00

209 lines
5.0 KiB
C

/*
* sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem.
*
* Changes:
* YOSHIFUJI Hideaki @USAGI: added icmp sysctl table.
*/
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/ndisc.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/inet_frag.h>
static int one = 1;
static struct ctl_table ipv6_table_template[] = {
{
.procname = "bindv6only",
.data = &init_net.ipv6.sysctl.bindv6only,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "anycast_src_echo_reply",
.data = &init_net.ipv6.sysctl.anycast_src_echo_reply,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "flowlabel_consistency",
.data = &init_net.ipv6.sysctl.flowlabel_consistency,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "auto_flowlabels",
.data = &init_net.ipv6.sysctl.auto_flowlabels,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "fwmark_reflect",
.data = &init_net.ipv6.sysctl.fwmark_reflect,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "idgen_retries",
.data = &init_net.ipv6.sysctl.idgen_retries,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "idgen_delay",
.data = &init_net.ipv6.sysctl.idgen_delay,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "flowlabel_state_ranges",
.data = &init_net.ipv6.sysctl.flowlabel_state_ranges,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{ }
};
static struct ctl_table ipv6_rotable[] = {
{
.procname = "mld_max_msf",
.data = &sysctl_mld_max_msf,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "mld_qrv",
.data = &sysctl_mld_qrv,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
{ }
};
static int __net_init ipv6_sysctl_net_init(struct net *net)
{
struct ctl_table *ipv6_table;
struct ctl_table *ipv6_route_table;
struct ctl_table *ipv6_icmp_table;
int err;
err = -ENOMEM;
ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
GFP_KERNEL);
if (!ipv6_table)
goto out;
ipv6_table[0].data = &net->ipv6.sysctl.bindv6only;
ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply;
ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency;
ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels;
ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
goto out_ipv6_table;
ipv6_icmp_table = ipv6_icmp_sysctl_init(net);
if (!ipv6_icmp_table)
goto out_ipv6_route_table;
net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table);
if (!net->ipv6.sysctl.hdr)
goto out_ipv6_icmp_table;
net->ipv6.sysctl.route_hdr =
register_net_sysctl(net, "net/ipv6/route", ipv6_route_table);
if (!net->ipv6.sysctl.route_hdr)
goto out_unregister_ipv6_table;
net->ipv6.sysctl.icmp_hdr =
register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table);
if (!net->ipv6.sysctl.icmp_hdr)
goto out_unregister_route_table;
err = 0;
out:
return err;
out_unregister_route_table:
unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr);
out_unregister_ipv6_table:
unregister_net_sysctl_table(net->ipv6.sysctl.hdr);
out_ipv6_icmp_table:
kfree(ipv6_icmp_table);
out_ipv6_route_table:
kfree(ipv6_route_table);
out_ipv6_table:
kfree(ipv6_table);
goto out;
}
static void __net_exit ipv6_sysctl_net_exit(struct net *net)
{
struct ctl_table *ipv6_table;
struct ctl_table *ipv6_route_table;
struct ctl_table *ipv6_icmp_table;
ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg;
ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg;
ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr);
unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr);
unregister_net_sysctl_table(net->ipv6.sysctl.hdr);
kfree(ipv6_table);
kfree(ipv6_route_table);
kfree(ipv6_icmp_table);
}
static struct pernet_operations ipv6_sysctl_net_ops = {
.init = ipv6_sysctl_net_init,
.exit = ipv6_sysctl_net_exit,
};
static struct ctl_table_header *ip6_header;
int ipv6_sysctl_register(void)
{
int err = -ENOMEM;
ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable);
if (!ip6_header)
goto out;
err = register_pernet_subsys(&ipv6_sysctl_net_ops);
if (err)
goto err_pernet;
out:
return err;
err_pernet:
unregister_net_sysctl_table(ip6_header);
goto out;
}
void ipv6_sysctl_unregister(void)
{
unregister_net_sysctl_table(ip6_header);
unregister_pernet_subsys(&ipv6_sysctl_net_ops);
}