inetpeer: get rid of ip_id_count
[ Upstream commit 73f156a6e8c1074ac6327e0abd1169e95eb66463 ]
Ideally, we would need to generate IP ID using a per destination IP
generator.
linux kernels used inet_peer cache for this purpose, but this had a huge
cost on servers disabling MTU discovery.
1) each inet_peer struct consumes 192 bytes
2) inetpeer cache uses a binary tree of inet_peer structs,
with a nominal size of ~66000 elements under load.
3) lookups in this tree are hitting a lot of cache lines, as tree depth
is about 20.
4) If server deals with many tcp flows, we have a high probability of
not finding the inet_peer, allocating a fresh one, inserting it in
the tree with same initial ip_id_count, (cf secure_ip_id())
5) We garbage collect inet_peer aggressively.
IP ID generation do not have to be 'perfect'
Goal is trying to avoid duplicates in a short period of time,
so that reassembly units have a chance to complete reassembly of
fragments belonging to one message before receiving other fragments
with a recycled ID.
We simply use an array of generators, and a Jenkin hash using the dst IP
as a key.
ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it
belongs (it is only used from this file)
secure_ip_id() and secure_ipv6_id() no longer are needed.
Rename ip_select_ident_more() to ip_select_ident_segs() to avoid
unnecessary decrement/increment of the number of segments.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 90bc88b..2836f56 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1341,46 +1341,23 @@
rt->rt_peer_genid = rt_peer_genid();
}
-/*
- * Peer allocation may fail only in serious out-of-memory conditions. However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
- */
-static void ip_select_fb_ident(struct iphdr *iph)
+atomic_t *ip_idents __read_mostly;
+EXPORT_SYMBOL(ip_idents);
+
+void __ip_select_ident(struct iphdr *iph, int segs)
{
- static DEFINE_SPINLOCK(ip_fb_id_lock);
- static u32 ip_fallback_id;
- u32 salt;
+ static u32 ip_idents_hashrnd __read_mostly;
+ static bool hashrnd_initialized = false;
+ u32 hash, id;
- spin_lock_bh(&ip_fb_id_lock);
- salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
- iph->id = htons(salt & 0xFFFF);
- ip_fallback_id = salt;
- spin_unlock_bh(&ip_fb_id_lock);
-}
+ if (unlikely(!hashrnd_initialized)) {
+ hashrnd_initialized = true;
+ get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+ }
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
-{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt && !(rt->dst.flags & DST_NOPEER)) {
- if (rt->peer == NULL)
- rt_bind_peer(rt, rt->rt_dst, 1);
-
- /* If peer is attached to destination, it is never detached,
- so that we need not to grab a lock to dereference it.
- */
- if (rt->peer) {
- iph->id = htons(inet_getid(rt->peer, more));
- return;
- }
- } else if (!rt)
- printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
- __builtin_return_address(0));
-
- ip_select_fb_ident(iph);
+ hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
@@ -3009,7 +2986,6 @@
error = rt->dst.error;
if (peer) {
inet_peer_refcheck(rt->peer);
- id = atomic_read(&peer->ip_id_count) & 0xffff;
if (peer->tcp_ts_stamp) {
ts = peer->tcp_ts;
tsage = get_seconds() - peer->tcp_ts_stamp;
@@ -3441,6 +3417,12 @@
{
int rc = 0;
+ ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+ if (!ip_idents)
+ panic("IP: failed to allocate ip_idents\n");
+
+ get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)