net: optimize INET input path further

Followup of commit b178bb3dfc30 (net: reorder struct sock fields)

Optimize INET input path a bit further, by :

1) moving sk_refcnt close to sk_lock.

This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).

2) moving inet_daddr & inet_rcv_saddr at the beginning of sk

(same cache line than hash / family / bound_dev_if / nulls_node)

This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.

Before patch :

offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274

After patch :

offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4

compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index a066fdd..17404b5 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -88,12 +88,6 @@
 extern void inet_twdr_twkill_work(struct work_struct *work);
 extern void inet_twdr_twcal_tick(unsigned long data);
 
-#if (BITS_PER_LONG == 64)
-#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8
-#else
-#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4
-#endif
-
 struct inet_bind_bucket;
 
 /*
@@ -117,15 +111,15 @@
 #define tw_hash			__tw_common.skc_hash
 #define tw_prot			__tw_common.skc_prot
 #define tw_net			__tw_common.skc_net
+#define tw_daddr        	__tw_common.skc_daddr
+#define tw_rcv_saddr    	__tw_common.skc_rcv_saddr
 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
-	/* 3 bits hole, try to pack */
 	unsigned char		tw_rcv_wscale;
+
 	/* Socket demultiplex comparisons on incoming packets. */
-	/* these five are in inet_sock */
+	/* these three are in inet_sock */
 	__be16			tw_sport;
-	__be32			tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
-	__be32			tw_rcv_saddr;
 	__be16			tw_dport;
 	__u16			tw_num;
 	kmemcheck_bitfield_begin(flags);
@@ -191,10 +185,10 @@
 	return (struct inet_timewait_sock *)sk;
 }
 
-static inline __be32 inet_rcv_saddr(const struct sock *sk)
+static inline __be32 sk_rcv_saddr(const struct sock *sk)
 {
-	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		inet_sk(sk)->inet_rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
+/* both inet_sk() and inet_twsk() store rcv_saddr in skc_rcv_saddr */
+	return sk->__sk_common.skc_rcv_saddr;
 }
 
 extern void inet_twsk_put(struct inet_timewait_sock *tw);