net_sched: add 64bit rate estimators

struct gnet_stats_rate_est contains u32 fields, so the bytes per second
field can wrap at 34360Mbit.

Add a new gnet_stats_rate_est64 structure to get 64bit bps/pps fields,
and switch the kernel to use this structure natively.

This structure is dumped to user space as a new attribute :

TCA_STATS_RATE_EST64

Old tc command will now display the capped bps (to 34360Mbit), instead
of wrapped values, and updated tc command will display correct
information.

Old tc command output, after patch :

eric:~# tc -s -d qd sh dev lo
qdisc pfifo 8001: root refcnt 2 limit 1000p
 Sent 80868245400 bytes 1978837 pkt (dropped 0, overlimits 0 requeues 0)
 rate 34360Mbit 189696pps backlog 0b 0p requeues 0

This patch carefully reorganizes "struct Qdisc" layout to get optimal
performance on SMP.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e7f4e21..df56760 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -58,14 +58,12 @@
 				      * multiqueue device.
 				      */
 #define TCQ_F_WARN_NONWC	(1 << 16)
-	int			padded;
+	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
 	struct list_head	list;
 	u32			handle;
 	u32			parent;
-	atomic_t		refcnt;
-	struct gnet_stats_rate_est	rate_est;
 	int			(*reshape_fail)(struct sk_buff *skb,
 					struct Qdisc *q);
 
@@ -76,8 +74,9 @@
 	 */
 	struct Qdisc		*__parent;
 	struct netdev_queue	*dev_queue;
-	struct Qdisc		*next_sched;
 
+	struct gnet_stats_rate_est64	rate_est;
+	struct Qdisc		*next_sched;
 	struct sk_buff		*gso_skb;
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
@@ -88,8 +87,10 @@
 	unsigned int		__state;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
-	spinlock_t		busylock;
-	u32			limit;
+	int			padded;
+	atomic_t		refcnt;
+
+	spinlock_t		busylock ____cacheline_aligned_in_smp;
 };
 
 static inline bool qdisc_is_running(const struct Qdisc *qdisc)