| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* | 
 | 2 |  * net/sched/estimator.c	Simple rate estimator. | 
 | 3 |  * | 
 | 4 |  *		This program is free software; you can redistribute it and/or | 
 | 5 |  *		modify it under the terms of the GNU General Public License | 
 | 6 |  *		as published by the Free Software Foundation; either version | 
 | 7 |  *		2 of the License, or (at your option) any later version. | 
 | 8 |  * | 
 | 9 |  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | 
 | 10 |  */ | 
 | 11 |  | 
 | 12 | #include <asm/uaccess.h> | 
 | 13 | #include <asm/system.h> | 
 | 14 | #include <linux/bitops.h> | 
 | 15 | #include <linux/module.h> | 
 | 16 | #include <linux/types.h> | 
 | 17 | #include <linux/kernel.h> | 
 | 18 | #include <linux/jiffies.h> | 
 | 19 | #include <linux/string.h> | 
 | 20 | #include <linux/mm.h> | 
 | 21 | #include <linux/socket.h> | 
 | 22 | #include <linux/sockios.h> | 
 | 23 | #include <linux/in.h> | 
 | 24 | #include <linux/errno.h> | 
 | 25 | #include <linux/interrupt.h> | 
 | 26 | #include <linux/netdevice.h> | 
 | 27 | #include <linux/skbuff.h> | 
 | 28 | #include <linux/rtnetlink.h> | 
 | 29 | #include <linux/init.h> | 
 | 30 | #include <net/sock.h> | 
 | 31 | #include <net/pkt_sched.h> | 
 | 32 |  | 
 | 33 | /* | 
 | 34 |    This code is NOT intended to be used for statistics collection, | 
 | 35 |    its purpose is to provide a base for statistical multiplexing | 
 | 36 |    for controlled load service. | 
 | 37 |    If you need only statistics, run a user level daemon which | 
 | 38 |    periodically reads byte counters. | 
 | 39 |  | 
 | 40 |    Unfortunately, rate estimation is not a very easy task. | 
 | 41 |    F.e. I did not find a simple way to estimate the current peak rate | 
 | 42 |    and even failed to formulate the problem 8)8) | 
 | 43 |  | 
   So I preferred not to build an estimator into the scheduler,
 | 45 |    but run this task separately. | 
 | 46 |    Ideally, it should be kernel thread(s), but for now it runs | 
 | 47 |    from timers, which puts apparent top bounds on the number of rated | 
 | 48 |    flows, has minimal overhead on small, but is enough | 
 | 49 |    to handle controlled load service, sets of aggregates. | 
 | 50 |  | 
 | 51 |    We measure rate over A=(1<<interval) seconds and evaluate EWMA: | 
 | 52 |  | 
 | 53 |    avrate = avrate*(1-W) + rate*W | 
 | 54 |  | 
 | 55 |    where W is chosen as negative power of 2: W = 2^(-ewma_log) | 
 | 56 |  | 
 | 57 |    The resulting time constant is: | 
 | 58 |  | 
 | 59 |    T = A/(-ln(1-W)) | 
 | 60 |  | 
 | 61 |  | 
 | 62 |    NOTES. | 
 | 63 |  | 
 | 64 |    * The stored value for avbps is scaled by 2^5, so that maximal | 
 | 65 |      rate is ~1Gbit, avpps is scaled by 2^10. | 
 | 66 |  | 
 | 67 |    * Minimal interval is HZ/4=250msec (it is the greatest common divisor | 
 | 68 |      for HZ=100 and HZ=1024 8)), maximal interval | 
 | 69 |      is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals | 
 | 70 |      are too expensive, longer ones can be implemented | 
 | 71 |      at user level painlessly. | 
 | 72 |  */ | 
 | 73 |  | 
 | 74 | #define EST_MAX_INTERVAL	5 | 
 | 75 |  | 
/* One registered rate estimator.  Nodes are singly linked per interval
 * index and walked by est_timer() under est_lock (read side).
 */
struct qdisc_estimator
{
	struct qdisc_estimator	*next;		/* next estimator on the same interval list */
	struct tc_stats		*stats;		/* counters we sample and rates we publish */
	spinlock_t		*stats_lock;	/* lock protecting *stats, owned by the qdisc */
	unsigned		interval;	/* interval index 0..EST_MAX_INTERVAL (parm->interval + 2) */
	int			ewma_log;	/* EWMA weight: W = 2^(-ewma_log) */
	u64			last_bytes;	/* stats->bytes at the previous sample */
	u32			last_packets;	/* stats->packets at the previous sample */
	u32			avpps;		/* average packets/sec, scaled by 2^10 */
	u32			avbps;		/* average bytes/sec, scaled by 2^5 */
};
 | 88 |  | 
/* Per-interval anchor: one timer driving the list of estimators that
 * share the same sampling period ((1<<idx)/4 seconds).
 */
struct qdisc_estimator_head
{
	struct timer_list	timer;	/* fires est_timer() with data = interval index */
	struct qdisc_estimator	*list;	/* head of the estimator chain for this interval */
};
 | 94 |  | 
/* One timer + estimator list per supported interval index. */
static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];

/* Estimator array lock.  Read side: est_timer() in timer (softirq)
 * context; write side: registration/removal in process context, taken
 * with _bh to exclude the softirq readers on the local CPU.
 */
static DEFINE_RWLOCK(est_lock);
 | 99 |  | 
/* Timer callback for interval index `arg`: for every estimator on that
 * interval's list, sample the byte/packet counters, fold the measured
 * rate into the EWMA averages, publish the rounded rates into the
 * qdisc's tc_stats, then re-arm the timer for the next period.
 */
static void est_timer(unsigned long arg)
{
	int idx = (int)arg;
	struct qdisc_estimator *e;

	read_lock(&est_lock);	/* keep the elist[idx] chain stable while we walk it */
	for (e = elist[idx].list; e; e = e->next) {
		struct tc_stats *st = e->stats;
		u64 nbytes;
		u32 npackets;
		u32 rate;

		spin_lock(e->stats_lock);	/* counters are updated by the qdisc under this lock */
		nbytes = st->bytes;
		npackets = st->packets;
		/* Period is (1<<idx)/4 sec, so bytes/sec = delta << (2 - idx);
		 * a further <<5 applies the 2^5 avbps scaling: 2 + 5 = 7. */
		rate = (nbytes - e->last_bytes)<<(7 - idx);
		e->last_bytes = nbytes;
		/* EWMA step: avbps += (rate - avbps) * 2^(-ewma_log) */
		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
		st->bps = (e->avbps+0xF)>>5;	/* round up, drop the 2^5 scaling */

		/* Same, with the 2^10 avpps scaling: 2 + 10 = 12. */
		rate = (npackets - e->last_packets)<<(12 - idx);
		e->last_packets = npackets;
		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
		e->stats->pps = (e->avpps+0x1FF)>>10;	/* round up, drop the 2^10 scaling */
		spin_unlock(e->stats_lock);
	}

	/* Re-arm for the next (1<<idx)/4-second period. */
	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
	read_unlock(&est_lock);
}
 | 130 |  | 
 | 131 | int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) | 
 | 132 | { | 
 | 133 | 	struct qdisc_estimator *est; | 
 | 134 | 	struct tc_estimator *parm = RTA_DATA(opt); | 
 | 135 |  | 
 | 136 | 	if (RTA_PAYLOAD(opt) < sizeof(*parm)) | 
 | 137 | 		return -EINVAL; | 
 | 138 |  | 
 | 139 | 	if (parm->interval < -2 || parm->interval > 3) | 
 | 140 | 		return -EINVAL; | 
 | 141 |  | 
 | 142 | 	est = kmalloc(sizeof(*est), GFP_KERNEL); | 
 | 143 | 	if (est == NULL) | 
 | 144 | 		return -ENOBUFS; | 
 | 145 |  | 
 | 146 | 	memset(est, 0, sizeof(*est)); | 
 | 147 | 	est->interval = parm->interval + 2; | 
 | 148 | 	est->stats = stats; | 
 | 149 | 	est->stats_lock = stats_lock; | 
 | 150 | 	est->ewma_log = parm->ewma_log; | 
 | 151 | 	est->last_bytes = stats->bytes; | 
 | 152 | 	est->avbps = stats->bps<<5; | 
 | 153 | 	est->last_packets = stats->packets; | 
 | 154 | 	est->avpps = stats->pps<<10; | 
 | 155 |  | 
 | 156 | 	est->next = elist[est->interval].list; | 
 | 157 | 	if (est->next == NULL) { | 
 | 158 | 		init_timer(&elist[est->interval].timer); | 
 | 159 | 		elist[est->interval].timer.data = est->interval; | 
 | 160 | 		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); | 
 | 161 | 		elist[est->interval].timer.function = est_timer; | 
 | 162 | 		add_timer(&elist[est->interval].timer); | 
 | 163 | 	} | 
 | 164 | 	write_lock_bh(&est_lock); | 
 | 165 | 	elist[est->interval].list = est; | 
 | 166 | 	write_unlock_bh(&est_lock); | 
 | 167 | 	return 0; | 
 | 168 | } | 
 | 169 |  | 
 | 170 | void qdisc_kill_estimator(struct tc_stats *stats) | 
 | 171 | { | 
 | 172 | 	int idx; | 
 | 173 | 	struct qdisc_estimator *est, **pest; | 
 | 174 |  | 
 | 175 | 	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { | 
 | 176 | 		int killed = 0; | 
 | 177 | 		pest = &elist[idx].list; | 
 | 178 | 		while ((est=*pest) != NULL) { | 
 | 179 | 			if (est->stats != stats) { | 
 | 180 | 				pest = &est->next; | 
 | 181 | 				continue; | 
 | 182 | 			} | 
 | 183 |  | 
 | 184 | 			write_lock_bh(&est_lock); | 
 | 185 | 			*pest = est->next; | 
 | 186 | 			write_unlock_bh(&est_lock); | 
 | 187 |  | 
 | 188 | 			kfree(est); | 
 | 189 | 			killed++; | 
 | 190 | 		} | 
 | 191 | 		if (killed && elist[idx].list == NULL) | 
 | 192 | 			del_timer(&elist[idx].timer); | 
 | 193 | 	} | 
 | 194 | } | 
 | 195 |  | 
 | 196 | EXPORT_SYMBOL(qdisc_kill_estimator); | 
 | 197 | EXPORT_SYMBOL(qdisc_new_estimator); |