/*
 * net/sched/estimator.c	Simple rate estimator.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/*
   This code is NOT intended to be used for statistics collection;
   its purpose is to provide a base for statistical multiplexing
   for controlled load service.
   If you need only statistics, run a user-level daemon that
   periodically reads the byte counters.

   Unfortunately, rate estimation is not a very easy task.
   For example, I did not find a simple way to estimate the current
   peak rate, and even failed to formulate the problem 8)8)

   So I preferred not to build an estimator into the scheduler,
   but to run this task separately.
   Ideally, it should be done by kernel thread(s), but for now it
   runs from timers. This puts an obvious upper bound on the number
   of rated flows, but has minimal overhead when that number is
   small and is enough to handle controlled load service and sets
   of aggregates.

   We measure rate over A=(1<<interval) seconds and evaluate EWMA:

   avrate = avrate*(1-W) + rate*W

   where W is chosen as a negative power of 2: W = 2^(-ewma_log)
   (a self-contained sketch of this fixed-point step follows the
   EST_MAX_INTERVAL definition below).

   The resulting time constant is:

   T = A/(-ln(1-W))


   NOTES.

   * The stored value for avbps is scaled by 2^5, so that the maximal
   representable rate is 2^32/2^5 = 2^27 bytes/sec, i.e. ~1Gbit/sec;
   avpps is scaled by 2^10.

   * The minimal interval is HZ/4=250msec (4 is the greatest common
   divisor of HZ=100 and HZ=1024 8)), the maximal interval
   is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
   are too expensive; longer ones can be implemented
   at user level painlessly.
*/

#define EST_MAX_INTERVAL	5

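/*
 * A minimal illustrative sketch of the fixed-point EWMA step described
 * above; it is not called anywhere in this file, which open-codes the
 * same update in est_timer(). With W = 2^(-ewma_log) the update
 *
 *	avrate = avrate*(1-W) + rate*W
 *
 * becomes an add and a shift instead of a multiplication. E.g. with
 * ewma_log=2 (W=1/4), avrate=1000 and rate=2000:
 * 1000 + ((2000-1000)>>2) = 1250, which equals 1000*3/4 + 2000*1/4.
 */
static inline u32 est_ewma_sketch(u32 avrate, u32 rate, int ewma_log)
{
	/* avrate*(1-W) + rate*W, with W = 2^(-ewma_log) */
	return avrate + (u32)(((long)rate - (long)avrate) >> ewma_log);
}
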
struct qdisc_estimator
{
	struct qdisc_estimator	*next;		/* chained on elist[interval] */
	struct tc_stats		*stats;		/* counters we estimate from */
	spinlock_t		*stats_lock;	/* protects *stats */
	unsigned		interval;	/* index into elist[] */
	int			ewma_log;	/* -log2 of the EWMA weight W */
	u64			last_bytes;	/* stats->bytes at last tick */
	u32			last_packets;	/* stats->packets at last tick */
	u32			avpps;		/* EWMA of pps, scaled by 2^10 */
	u32			avbps;		/* EWMA of bps, scaled by 2^5 */
};

struct qdisc_estimator_head
{
	struct timer_list	timer;		/* periodic tick for this interval */
	struct qdisc_estimator	*list;		/* estimators sharing this interval */
};

/* One bucket per sampling interval; bucket idx's timer fires every
   (HZ<<idx)/4 jiffies and walks that bucket's list. */
static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];

/* Estimator array lock: est_timer() takes it for reading; list
   add/remove take it for writing, with BHs disabled. */
static DEFINE_RWLOCK(est_lock);
static void est_timer(unsigned long arg)
{
	int idx = (int)arg;
	struct qdisc_estimator *e;

	read_lock(&est_lock);
	for (e = elist[idx].list; e; e = e->next) {
		struct tc_stats *st = e->stats;
		u64 nbytes;
		u32 npackets;
		u32 rate;

		spin_lock(e->stats_lock);
		nbytes = st->bytes;
		npackets = st->packets;

		/* The sampling interval is 2^idx/4 sec, so dividing the
		   byte delta by it and scaling by 2^5 is a single shift:
		   delta*4*2^5/2^idx = delta<<(7-idx). */
		rate = (nbytes - e->last_bytes)<<(7 - idx);
		e->last_bytes = nbytes;
		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
		/* Publish the unscaled value, with rounding. */
		st->bps = (e->avbps+0xF)>>5;

		/* Same for packets, with avpps scaled by 2^10:
		   delta*4*2^10/2^idx = delta<<(12-idx). */
		rate = (npackets - e->last_packets)<<(12 - idx);
		e->last_packets = npackets;
		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
		st->pps = (e->avpps+0x1FF)>>10;
		spin_unlock(e->stats_lock);
	}

	/* Re-arm: interval index idx ticks every (HZ<<idx)/4 jiffies. */
	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
	read_unlock(&est_lock);
}
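
/*
 * An illustrative restatement of the byte-rate scaling above, not
 * called by this file; the numbers are examples, not taken from the
 * source. With idx=2 the timer ticks every (HZ<<2)/4 = HZ jiffies,
 * i.e. once per second, so a delta of 125000 bytes (1 Mbit/sec)
 * yields 125000<<(7-2) == 4000000, which is 125000 bytes/sec scaled
 * by 2^5.
 */
static inline u32 est_scale_bps_sketch(u64 delta_bytes, int idx)
{
	/* delta / (2^idx/4 sec) * 2^5  ==  delta << (7 - idx) */
	return (u32)(delta_bytes << (7 - idx));
}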

int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt)
{
	struct qdisc_estimator *est;
	struct tc_estimator *parm = RTA_DATA(opt);

	if (RTA_PAYLOAD(opt) < sizeof(*parm))
		return -EINVAL;

	/* The user-visible interval is biased by -2, so that
	   interval+2 indexes elist[0..EST_MAX_INTERVAL]. */
	if (parm->interval < -2 || parm->interval > 3)
		return -EINVAL;

	est = kmalloc(sizeof(*est), GFP_KERNEL);
	if (est == NULL)
		return -ENOBUFS;

	memset(est, 0, sizeof(*est));
	est->interval = parm->interval + 2;
	est->stats = stats;
	est->stats_lock = stats_lock;
	est->ewma_log = parm->ewma_log;
	/* Seed the averages from the current counters so the EWMA
	   does not have to climb up from zero. */
	est->last_bytes = stats->bytes;
	est->avbps = stats->bps<<5;
	est->last_packets = stats->packets;
	est->avpps = stats->pps<<10;

	est->next = elist[est->interval].list;
	/* First estimator on this interval list: start its timer. */
	if (est->next == NULL) {
		init_timer(&elist[est->interval].timer);
		elist[est->interval].timer.data = est->interval;
		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
		elist[est->interval].timer.function = est_timer;
		add_timer(&elist[est->interval].timer);
	}
	write_lock_bh(&est_lock);
	elist[est->interval].list = est;
	write_unlock_bh(&est_lock);
	return 0;
}
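
/*
 * Usage sketch (hypothetical caller, not part of this file): a qdisc
 * configured with a rate-estimator netlink attribute would attach an
 * estimator roughly like
 *
 *	err = qdisc_new_estimator(&sch->stats, sch->stats_lock,
 *				  tca[TCA_RATE-1]);
 *
 * and detach it on destruction with qdisc_kill_estimator(&sch->stats).
 * The names sch and tca here stand in for the caller's own state.
 */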

void qdisc_kill_estimator(struct tc_stats *stats)
{
	int idx;
	struct qdisc_estimator *est, **pest;

	/* Remove every estimator attached to these counters, scanning
	   all interval lists. */
	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
		int killed = 0;
		pest = &elist[idx].list;
		while ((est=*pest) != NULL) {
			if (est->stats != stats) {
				pest = &est->next;
				continue;
			}

			write_lock_bh(&est_lock);
			*pest = est->next;
			write_unlock_bh(&est_lock);

			kfree(est);
			killed++;
		}
		/* Last estimator on this interval gone: stop its timer. */
		if (killed && elist[idx].list == NULL)
			del_timer(&elist[idx].timer);
	}
}

EXPORT_SYMBOL(qdisc_kill_estimator);
EXPORT_SYMBOL(qdisc_new_estimator);