blob: 8e8c5becc348c4439d2972d98d60c26d831971c4 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
Linus Torvalds1da177e2005-04-16 15:20:36 -070018#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070021#include <linux/string.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070023#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070024#include <linux/init.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/kmod.h>
28#include <linux/list.h>
Patrick McHardy41794772007-03-16 01:19:15 -070029#include <linux/hrtimer.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020031#include <net/net_namespace.h>
Denis V. Lunevb8542722007-12-01 00:21:31 +110032#include <net/sock.h>
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -070033#include <net/netlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <net/pkt_sched.h>
35
Linus Torvalds1da177e2005-04-16 15:20:36 -070036static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
37 struct Qdisc *old, struct Qdisc *new);
38static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
39 struct Qdisc *q, unsigned long cl, int event);
40
41/*
42
43 Short review.
44 -------------
45
46 This file consists of two interrelated parts:
47
48 1. queueing disciplines manager frontend.
49 2. traffic classes manager frontend.
50
51 Generally, queueing discipline ("qdisc") is a black box,
52 which is able to enqueue packets and to dequeue them (when
53 device is ready to send something) in order and at times
54 determined by algorithm hidden in it.
55
56 qdisc's are divided to two categories:
57 - "queues", which have no internal structure visible from outside.
58 - "schedulers", which split all the packets to "traffic classes",
59 using "packet classifiers" (look at cls_api.c)
60
61 In turn, classes may have child qdiscs (as rule, queues)
62 attached to them etc. etc. etc.
63
64 The goal of the routines in this file is to translate
65 information supplied by user in the form of handles
66 to more intelligible for kernel form, to make some sanity
67 checks and part of work, which is common to all qdiscs
68 and to provide rtnetlink notifications.
69
70 All real intelligent work is done inside qdisc modules.
71
72
73
74 Every discipline has two major routines: enqueue and dequeue.
75
76 ---dequeue
77
78 dequeue usually returns a skb to send. It is allowed to return NULL,
79 but it does not mean that queue is empty, it just means that
80 discipline does not want to send anything this time.
81 Queue is really empty if q->q.qlen == 0.
82 For complicated disciplines with multiple queues q->q is not
83 real packet queue, but however q->q.qlen must be valid.
84
85 ---enqueue
86
87 enqueue returns 0, if packet was enqueued successfully.
88 If packet (this one or another one) was dropped, it returns
89 not zero error code.
90 NET_XMIT_DROP - this packet dropped
91 Expected action: do not backoff, but wait until queue will clear.
92 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
93 Expected action: backoff or ignore
94 NET_XMIT_POLICED - dropped by police.
95 Expected action: backoff or error to real-time apps.
96
97 Auxiliary routines:
98
99 ---requeue
100
101 requeues once dequeued packet. It is used for non-standard or
 102 just buggy devices, which can defer output even if netif_queue_stopped()=0.
 103
104 ---reset
105
106 returns qdisc to initial state: purge all buffers, clear all
107 timers, counters (except for statistics) etc.
108
109 ---init
110
111 initializes newly created qdisc.
112
113 ---destroy
114
115 destroys resources allocated by init and during lifetime of qdisc.
116
117 ---change
118
119 changes qdisc parameters.
120 */
121
122/* Protects list of registered TC modules. It is pure SMP lock. */
123static DEFINE_RWLOCK(qdisc_mod_lock);
124
125
126/************************************************
127 * Queueing disciplines manipulation. *
128 ************************************************/
129
130
131/* The list of all installed queueing disciplines. */
132
133static struct Qdisc_ops *qdisc_base;
134
135/* Register/uregister queueing discipline */
136
137int register_qdisc(struct Qdisc_ops *qops)
138{
139 struct Qdisc_ops *q, **qp;
140 int rc = -EEXIST;
141
142 write_lock(&qdisc_mod_lock);
143 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 if (!strcmp(qops->id, q->id))
145 goto out;
146
147 if (qops->enqueue == NULL)
148 qops->enqueue = noop_qdisc_ops.enqueue;
149 if (qops->requeue == NULL)
150 qops->requeue = noop_qdisc_ops.requeue;
151 if (qops->dequeue == NULL)
152 qops->dequeue = noop_qdisc_ops.dequeue;
153
154 qops->next = NULL;
155 *qp = qops;
156 rc = 0;
157out:
158 write_unlock(&qdisc_mod_lock);
159 return rc;
160}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800161EXPORT_SYMBOL(register_qdisc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700162
163int unregister_qdisc(struct Qdisc_ops *qops)
164{
165 struct Qdisc_ops *q, **qp;
166 int err = -ENOENT;
167
168 write_lock(&qdisc_mod_lock);
169 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
170 if (q == qops)
171 break;
172 if (q) {
173 *qp = q->next;
174 q->next = NULL;
175 err = 0;
176 }
177 write_unlock(&qdisc_mod_lock);
178 return err;
179}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800180EXPORT_SYMBOL(unregister_qdisc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181
182/* We know handle. Find qdisc among all qdisc's attached to device
183 (root qdisc, all its children, children of children etc.)
184 */
185
David S. Millere8a04642008-07-17 00:34:19 -0700186static struct Qdisc *__qdisc_lookup(struct netdev_queue *dev_queue, u32 handle)
Patrick McHardy43effa12006-11-29 17:35:48 -0800187{
188 struct Qdisc *q;
189
David S. Millerb0e1e642008-07-08 17:42:10 -0700190 list_for_each_entry(q, &dev_queue->qdisc_list, list) {
Patrick McHardy43effa12006-11-29 17:35:48 -0800191 if (q->handle == handle)
192 return q;
193 }
194 return NULL;
195}
196
David S. Millere8a04642008-07-17 00:34:19 -0700197struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
198{
199 unsigned int i;
200
201 for (i = 0; i < dev->num_tx_queues; i++) {
202 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
203 struct Qdisc *q = __qdisc_lookup(txq, handle);
204 if (q)
205 return q;
206 }
207 return NULL;
208}
209
Linus Torvalds1da177e2005-04-16 15:20:36 -0700210static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211{
212 unsigned long cl;
213 struct Qdisc *leaf;
Eric Dumazet20fea082007-11-14 01:44:41 -0800214 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700215
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
219
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
225}
226
227/* Find queueing discipline by name */
228
Patrick McHardy1e904742008-01-22 22:11:17 -0800229static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700230{
231 struct Qdisc_ops *q = NULL;
232
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
Patrick McHardy1e904742008-01-22 22:11:17 -0800236 if (nla_strcmp(kind, q->id) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
240 }
241 }
242 read_unlock(&qdisc_mod_lock);
243 }
244 return q;
245}
246
247static struct qdisc_rate_table *qdisc_rtab_list;
248
Patrick McHardy1e904742008-01-22 22:11:17 -0800249struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250{
251 struct qdisc_rate_table *rtab;
252
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
257 }
258 }
259
Patrick McHardy5feb5e12008-01-23 20:35:19 -0800260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
261 nla_len(tab) != TC_RTAB_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700262 return NULL;
263
264 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
265 if (rtab) {
266 rtab->rate = *r;
267 rtab->refcnt = 1;
Patrick McHardy1e904742008-01-22 22:11:17 -0800268 memcpy(rtab->data, nla_data(tab), 1024);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269 rtab->next = qdisc_rtab_list;
270 qdisc_rtab_list = rtab;
271 }
272 return rtab;
273}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800274EXPORT_SYMBOL(qdisc_get_rtab);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275
276void qdisc_put_rtab(struct qdisc_rate_table *tab)
277{
278 struct qdisc_rate_table *rtab, **rtabp;
279
280 if (!tab || --tab->refcnt)
281 return;
282
283 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
284 if (rtab == tab) {
285 *rtabp = rtab->next;
286 kfree(rtab);
287 return;
288 }
289 }
290}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800291EXPORT_SYMBOL(qdisc_put_rtab);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700292
Patrick McHardy41794772007-03-16 01:19:15 -0700293static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
294{
295 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
296 timer);
297
298 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
Stephen Hemminger11274e52007-03-22 12:17:42 -0700299 smp_wmb();
David S. Miller37437bb2008-07-16 02:15:04 -0700300 __netif_schedule(wd->qdisc);
Stephen Hemminger19365022007-03-22 12:18:35 -0700301
Patrick McHardy41794772007-03-16 01:19:15 -0700302 return HRTIMER_NORESTART;
303}
304
305void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
306{
307 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
308 wd->timer.function = qdisc_watchdog;
309 wd->qdisc = qdisc;
310}
311EXPORT_SYMBOL(qdisc_watchdog_init);
312
313void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
314{
315 ktime_t time;
316
317 wd->qdisc->flags |= TCQ_F_THROTTLED;
318 time = ktime_set(0, 0);
319 time = ktime_add_ns(time, PSCHED_US2NS(expires));
320 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
321}
322EXPORT_SYMBOL(qdisc_watchdog_schedule);
323
324void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
325{
326 hrtimer_cancel(&wd->timer);
327 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
328}
329EXPORT_SYMBOL(qdisc_watchdog_cancel);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700330
Patrick McHardy6fe1c7a2008-07-05 23:21:31 -0700331struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
332{
333 unsigned int size = n * sizeof(struct hlist_head), i;
334 struct hlist_head *h;
335
336 if (size <= PAGE_SIZE)
337 h = kmalloc(size, GFP_KERNEL);
338 else
339 h = (struct hlist_head *)
340 __get_free_pages(GFP_KERNEL, get_order(size));
341
342 if (h != NULL) {
343 for (i = 0; i < n; i++)
344 INIT_HLIST_HEAD(&h[i]);
345 }
346 return h;
347}
348
349static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
350{
351 unsigned int size = n * sizeof(struct hlist_head);
352
353 if (size <= PAGE_SIZE)
354 kfree(h);
355 else
356 free_pages((unsigned long)h, get_order(size));
357}
358
359void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
360{
361 struct Qdisc_class_common *cl;
362 struct hlist_node *n, *next;
363 struct hlist_head *nhash, *ohash;
364 unsigned int nsize, nmask, osize;
365 unsigned int i, h;
366
367 /* Rehash when load factor exceeds 0.75 */
368 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
369 return;
370 nsize = clhash->hashsize * 2;
371 nmask = nsize - 1;
372 nhash = qdisc_class_hash_alloc(nsize);
373 if (nhash == NULL)
374 return;
375
376 ohash = clhash->hash;
377 osize = clhash->hashsize;
378
379 sch_tree_lock(sch);
380 for (i = 0; i < osize; i++) {
381 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
382 h = qdisc_class_hash(cl->classid, nmask);
383 hlist_add_head(&cl->hnode, &nhash[h]);
384 }
385 }
386 clhash->hash = nhash;
387 clhash->hashsize = nsize;
388 clhash->hashmask = nmask;
389 sch_tree_unlock(sch);
390
391 qdisc_class_hash_free(ohash, osize);
392}
393EXPORT_SYMBOL(qdisc_class_hash_grow);
394
395int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
396{
397 unsigned int size = 4;
398
399 clhash->hash = qdisc_class_hash_alloc(size);
400 if (clhash->hash == NULL)
401 return -ENOMEM;
402 clhash->hashsize = size;
403 clhash->hashmask = size - 1;
404 clhash->hashelems = 0;
405 return 0;
406}
407EXPORT_SYMBOL(qdisc_class_hash_init);
408
409void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
410{
411 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
412}
413EXPORT_SYMBOL(qdisc_class_hash_destroy);
414
415void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
416 struct Qdisc_class_common *cl)
417{
418 unsigned int h;
419
420 INIT_HLIST_NODE(&cl->hnode);
421 h = qdisc_class_hash(cl->classid, clhash->hashmask);
422 hlist_add_head(&cl->hnode, &clhash->hash[h]);
423 clhash->hashelems++;
424}
425EXPORT_SYMBOL(qdisc_class_hash_insert);
426
427void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
428 struct Qdisc_class_common *cl)
429{
430 hlist_del(&cl->hnode);
431 clhash->hashelems--;
432}
433EXPORT_SYMBOL(qdisc_class_hash_remove);
434
Linus Torvalds1da177e2005-04-16 15:20:36 -0700435/* Allocate an unique handle from space managed by kernel */
436
437static u32 qdisc_alloc_handle(struct net_device *dev)
438{
439 int i = 0x10000;
440 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
441
442 do {
443 autohandle += TC_H_MAKE(0x10000U, 0);
444 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
445 autohandle = TC_H_MAKE(0x80000000U, 0);
446 } while (qdisc_lookup(dev, autohandle) && --i > 0);
447
448 return i>0 ? autohandle : 0;
449}
450
451/* Attach toplevel qdisc to device dev */
452
453static struct Qdisc *
454dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
455{
David S. Millerb0e1e642008-07-08 17:42:10 -0700456 struct netdev_queue *dev_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700457 struct Qdisc *oqdisc;
458
459 if (dev->flags & IFF_UP)
460 dev_deactivate(dev);
461
462 qdisc_lock_tree(dev);
463 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
David S. Miller816f3252008-07-08 22:49:00 -0700464 dev_queue = &dev->rx_queue;
465 oqdisc = dev_queue->qdisc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466 /* Prune old scheduler */
467 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
468 /* delete */
469 qdisc_reset(oqdisc);
David S. Miller816f3252008-07-08 22:49:00 -0700470 dev_queue->qdisc = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700471 } else { /* new */
David S. Miller816f3252008-07-08 22:49:00 -0700472 dev_queue->qdisc = qdisc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473 }
474
475 } else {
David S. Millere8a04642008-07-17 00:34:19 -0700476 dev_queue = netdev_get_tx_queue(dev, 0);
David S. Millerb0e1e642008-07-08 17:42:10 -0700477 oqdisc = dev_queue->qdisc_sleeping;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478
479 /* Prune old scheduler */
480 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
481 qdisc_reset(oqdisc);
482
483 /* ... and graft new one */
484 if (qdisc == NULL)
485 qdisc = &noop_qdisc;
David S. Millerb0e1e642008-07-08 17:42:10 -0700486 dev_queue->qdisc_sleeping = qdisc;
487 dev_queue->qdisc = &noop_qdisc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700488 }
489
490 qdisc_unlock_tree(dev);
491
492 if (dev->flags & IFF_UP)
493 dev_activate(dev);
494
495 return oqdisc;
496}
497
Patrick McHardy43effa12006-11-29 17:35:48 -0800498void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
499{
Eric Dumazet20fea082007-11-14 01:44:41 -0800500 const struct Qdisc_class_ops *cops;
Patrick McHardy43effa12006-11-29 17:35:48 -0800501 unsigned long cl;
502 u32 parentid;
503
504 if (n == 0)
505 return;
506 while ((parentid = sch->parent)) {
Jarek Poplawski066a3b52008-04-14 15:10:42 -0700507 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
508 return;
509
David S. Miller5ce2d482008-07-08 17:06:30 -0700510 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
Patrick McHardyffc8fef2007-07-30 17:11:50 -0700511 if (sch == NULL) {
512 WARN_ON(parentid != TC_H_ROOT);
513 return;
514 }
Patrick McHardy43effa12006-11-29 17:35:48 -0800515 cops = sch->ops->cl_ops;
516 if (cops->qlen_notify) {
517 cl = cops->get(sch, parentid);
518 cops->qlen_notify(sch, cl);
519 cops->put(sch, cl);
520 }
521 sch->q.qlen -= n;
522 }
523}
524EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700525
526/* Graft qdisc "new" to class "classid" of qdisc "parent" or
527 to device "dev".
528
529 Old qdisc is not destroyed but returned in *old.
530 */
531
532static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
533 u32 classid,
534 struct Qdisc *new, struct Qdisc **old)
535{
536 int err = 0;
537 struct Qdisc *q = *old;
538
539
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900540 if (parent == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 if (q && q->flags&TCQ_F_INGRESS) {
542 *old = dev_graft_qdisc(dev, q);
543 } else {
544 *old = dev_graft_qdisc(dev, new);
545 }
546 } else {
Eric Dumazet20fea082007-11-14 01:44:41 -0800547 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700548
549 err = -EINVAL;
550
551 if (cops) {
552 unsigned long cl = cops->get(parent, classid);
553 if (cl) {
554 err = cops->graft(parent, cl, new, old);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700555 cops->put(parent, cl);
556 }
557 }
558 }
559 return err;
560}
561
562/*
563 Allocate and initialize new qdisc.
564
565 Parameters are passed via opt.
566 */
567
568static struct Qdisc *
David S. Millerbb949fb2008-07-08 16:55:56 -0700569qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
570 u32 parent, u32 handle, struct nlattr **tca, int *errp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700571{
572 int err;
Patrick McHardy1e904742008-01-22 22:11:17 -0800573 struct nlattr *kind = tca[TCA_KIND];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700574 struct Qdisc *sch;
575 struct Qdisc_ops *ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576
577 ops = qdisc_lookup_ops(kind);
578#ifdef CONFIG_KMOD
579 if (ops == NULL && kind != NULL) {
580 char name[IFNAMSIZ];
Patrick McHardy1e904742008-01-22 22:11:17 -0800581 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700582 /* We dropped the RTNL semaphore in order to
583 * perform the module load. So, even if we
584 * succeeded in loading the module we have to
585 * tell the caller to replay the request. We
586 * indicate this using -EAGAIN.
587 * We replay the request because the device may
588 * go away in the mean time.
589 */
590 rtnl_unlock();
591 request_module("sch_%s", name);
592 rtnl_lock();
593 ops = qdisc_lookup_ops(kind);
594 if (ops != NULL) {
595 /* We will try again qdisc_lookup_ops,
596 * so don't keep a reference.
597 */
598 module_put(ops->owner);
599 err = -EAGAIN;
600 goto err_out;
601 }
602 }
603 }
604#endif
605
Jamal Hadi Salimb9e2cc02006-08-03 16:36:51 -0700606 err = -ENOENT;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607 if (ops == NULL)
608 goto err_out;
609
David S. Miller5ce2d482008-07-08 17:06:30 -0700610 sch = qdisc_alloc(dev_queue, ops);
Thomas Graf3d54b822005-07-05 14:15:09 -0700611 if (IS_ERR(sch)) {
612 err = PTR_ERR(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700613 goto err_out2;
Thomas Graf3d54b822005-07-05 14:15:09 -0700614 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700615
Patrick McHardyffc8fef2007-07-30 17:11:50 -0700616 sch->parent = parent;
617
Thomas Graf3d54b822005-07-05 14:15:09 -0700618 if (handle == TC_H_INGRESS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619 sch->flags |= TCQ_F_INGRESS;
Thomas Graf3d54b822005-07-05 14:15:09 -0700620 handle = TC_H_MAKE(TC_H_INGRESS, 0);
Patrick McHardyfd44de72007-04-16 17:07:08 -0700621 } else {
Patrick McHardyfd44de72007-04-16 17:07:08 -0700622 if (handle == 0) {
623 handle = qdisc_alloc_handle(dev);
624 err = -ENOMEM;
625 if (handle == 0)
626 goto err_out3;
627 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700628 }
629
Thomas Graf3d54b822005-07-05 14:15:09 -0700630 sch->handle = handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700631
Patrick McHardy1e904742008-01-22 22:11:17 -0800632 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
633 if (tca[TCA_RATE]) {
Thomas Graf023e09a2005-07-05 14:15:53 -0700634 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
David S. Miller7698b4f2008-07-16 01:42:40 -0700635 qdisc_root_lock(sch),
Patrick McHardy1e904742008-01-22 22:11:17 -0800636 tca[TCA_RATE]);
Thomas Graf023e09a2005-07-05 14:15:53 -0700637 if (err) {
638 /*
639 * Any broken qdiscs that would require
640 * a ops->reset() here? The qdisc was never
641 * in action so it shouldn't be necessary.
642 */
643 if (ops->destroy)
644 ops->destroy(sch);
645 goto err_out3;
646 }
647 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648 qdisc_lock_tree(dev);
David S. Millerb0e1e642008-07-08 17:42:10 -0700649 list_add_tail(&sch->list, &dev_queue->qdisc_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650 qdisc_unlock_tree(dev);
651
Linus Torvalds1da177e2005-04-16 15:20:36 -0700652 return sch;
653 }
654err_out3:
655 dev_put(dev);
Thomas Graf3d54b822005-07-05 14:15:09 -0700656 kfree((char *) sch - sch->padded);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657err_out2:
658 module_put(ops->owner);
659err_out:
660 *errp = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661 return NULL;
662}
663
Patrick McHardy1e904742008-01-22 22:11:17 -0800664static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700665{
Patrick McHardy1e904742008-01-22 22:11:17 -0800666 if (tca[TCA_OPTIONS]) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667 int err;
668
669 if (sch->ops->change == NULL)
670 return -EINVAL;
Patrick McHardy1e904742008-01-22 22:11:17 -0800671 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672 if (err)
673 return err;
674 }
Patrick McHardy1e904742008-01-22 22:11:17 -0800675 if (tca[TCA_RATE])
Linus Torvalds1da177e2005-04-16 15:20:36 -0700676 gen_replace_estimator(&sch->bstats, &sch->rate_est,
David S. Miller7698b4f2008-07-16 01:42:40 -0700677 qdisc_root_lock(sch), tca[TCA_RATE]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678 return 0;
679}
680
681struct check_loop_arg
682{
683 struct qdisc_walker w;
684 struct Qdisc *p;
685 int depth;
686};
687
688static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
689
690static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
691{
692 struct check_loop_arg arg;
693
694 if (q->ops->cl_ops == NULL)
695 return 0;
696
697 arg.w.stop = arg.w.skip = arg.w.count = 0;
698 arg.w.fn = check_loop_fn;
699 arg.depth = depth;
700 arg.p = p;
701 q->ops->cl_ops->walk(q, &arg.w);
702 return arg.w.stop ? -ELOOP : 0;
703}
704
705static int
706check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
707{
708 struct Qdisc *leaf;
Eric Dumazet20fea082007-11-14 01:44:41 -0800709 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710 struct check_loop_arg *arg = (struct check_loop_arg *)w;
711
712 leaf = cops->leaf(q, cl);
713 if (leaf) {
714 if (leaf == arg->p || arg->depth > 7)
715 return -ELOOP;
716 return check_loop(leaf, arg->p, arg->depth + 1);
717 }
718 return 0;
719}
720
721/*
722 * Delete/get qdisc.
723 */
724
725static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
726{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900727 struct net *net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700728 struct tcmsg *tcm = NLMSG_DATA(n);
Patrick McHardy1e904742008-01-22 22:11:17 -0800729 struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700730 struct net_device *dev;
731 u32 clid = tcm->tcm_parent;
732 struct Qdisc *q = NULL;
733 struct Qdisc *p = NULL;
734 int err;
735
Denis V. Lunevb8542722007-12-01 00:21:31 +1100736 if (net != &init_net)
737 return -EINVAL;
738
Eric W. Biederman881d9662007-09-17 11:56:21 -0700739 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740 return -ENODEV;
741
Patrick McHardy1e904742008-01-22 22:11:17 -0800742 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
743 if (err < 0)
744 return err;
745
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746 if (clid) {
747 if (clid != TC_H_ROOT) {
748 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
749 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
750 return -ENOENT;
751 q = qdisc_leaf(p, clid);
752 } else { /* ingress */
David S. Miller816f3252008-07-08 22:49:00 -0700753 q = dev->rx_queue.qdisc;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900754 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700755 } else {
David S. Millere8a04642008-07-17 00:34:19 -0700756 struct netdev_queue *dev_queue;
757 dev_queue = netdev_get_tx_queue(dev, 0);
David S. Millerb0e1e642008-07-08 17:42:10 -0700758 q = dev_queue->qdisc_sleeping;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759 }
760 if (!q)
761 return -ENOENT;
762
763 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
764 return -EINVAL;
765 } else {
766 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
767 return -ENOENT;
768 }
769
Patrick McHardy1e904742008-01-22 22:11:17 -0800770 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700771 return -EINVAL;
772
773 if (n->nlmsg_type == RTM_DELQDISC) {
774 if (!clid)
775 return -EINVAL;
776 if (q->handle == 0)
777 return -ENOENT;
778 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
779 return err;
780 if (q) {
781 qdisc_notify(skb, n, clid, q, NULL);
Patrick McHardyfd44de72007-04-16 17:07:08 -0700782 qdisc_lock_tree(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700783 qdisc_destroy(q);
Patrick McHardyfd44de72007-04-16 17:07:08 -0700784 qdisc_unlock_tree(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700785 }
786 } else {
787 qdisc_notify(skb, n, clid, NULL, q);
788 }
789 return 0;
790}
791
792/*
793 Create/change qdisc.
794 */
795
796static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
797{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900798 struct net *net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700799 struct tcmsg *tcm;
Patrick McHardy1e904742008-01-22 22:11:17 -0800800 struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700801 struct net_device *dev;
802 u32 clid;
803 struct Qdisc *q, *p;
804 int err;
805
Denis V. Lunevb8542722007-12-01 00:21:31 +1100806 if (net != &init_net)
807 return -EINVAL;
808
Linus Torvalds1da177e2005-04-16 15:20:36 -0700809replay:
810 /* Reinit, just in case something touches this. */
811 tcm = NLMSG_DATA(n);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700812 clid = tcm->tcm_parent;
813 q = p = NULL;
814
Eric W. Biederman881d9662007-09-17 11:56:21 -0700815 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700816 return -ENODEV;
817
Patrick McHardy1e904742008-01-22 22:11:17 -0800818 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
819 if (err < 0)
820 return err;
821
Linus Torvalds1da177e2005-04-16 15:20:36 -0700822 if (clid) {
823 if (clid != TC_H_ROOT) {
824 if (clid != TC_H_INGRESS) {
825 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
826 return -ENOENT;
827 q = qdisc_leaf(p, clid);
828 } else { /*ingress */
David S. Miller816f3252008-07-08 22:49:00 -0700829 q = dev->rx_queue.qdisc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700830 }
831 } else {
David S. Millere8a04642008-07-17 00:34:19 -0700832 struct netdev_queue *dev_queue;
833 dev_queue = netdev_get_tx_queue(dev, 0);
David S. Millerb0e1e642008-07-08 17:42:10 -0700834 q = dev_queue->qdisc_sleeping;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700835 }
836
837 /* It may be default qdisc, ignore it */
838 if (q && q->handle == 0)
839 q = NULL;
840
841 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
842 if (tcm->tcm_handle) {
843 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
844 return -EEXIST;
845 if (TC_H_MIN(tcm->tcm_handle))
846 return -EINVAL;
847 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
848 goto create_n_graft;
849 if (n->nlmsg_flags&NLM_F_EXCL)
850 return -EEXIST;
Patrick McHardy1e904742008-01-22 22:11:17 -0800851 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700852 return -EINVAL;
853 if (q == p ||
854 (p && check_loop(q, p, 0)))
855 return -ELOOP;
856 atomic_inc(&q->refcnt);
857 goto graft;
858 } else {
859 if (q == NULL)
860 goto create_n_graft;
861
862 /* This magic test requires explanation.
863 *
864 * We know, that some child q is already
865 * attached to this parent and have choice:
866 * either to change it or to create/graft new one.
867 *
868 * 1. We are allowed to create/graft only
869 * if CREATE and REPLACE flags are set.
870 *
871 * 2. If EXCL is set, requestor wanted to say,
872 * that qdisc tcm_handle is not expected
873 * to exist, so that we choose create/graft too.
874 *
875 * 3. The last case is when no flags are set.
876 * Alas, it is sort of hole in API, we
877 * cannot decide what to do unambiguously.
878 * For now we select create/graft, if
879 * user gave KIND, which does not match existing.
880 */
881 if ((n->nlmsg_flags&NLM_F_CREATE) &&
882 (n->nlmsg_flags&NLM_F_REPLACE) &&
883 ((n->nlmsg_flags&NLM_F_EXCL) ||
Patrick McHardy1e904742008-01-22 22:11:17 -0800884 (tca[TCA_KIND] &&
885 nla_strcmp(tca[TCA_KIND], q->ops->id))))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700886 goto create_n_graft;
887 }
888 }
889 } else {
890 if (!tcm->tcm_handle)
891 return -EINVAL;
892 q = qdisc_lookup(dev, tcm->tcm_handle);
893 }
894
895 /* Change qdisc parameters */
896 if (q == NULL)
897 return -ENOENT;
898 if (n->nlmsg_flags&NLM_F_EXCL)
899 return -EEXIST;
Patrick McHardy1e904742008-01-22 22:11:17 -0800900 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700901 return -EINVAL;
902 err = qdisc_change(q, tca);
903 if (err == 0)
904 qdisc_notify(skb, n, clid, NULL, q);
905 return err;
906
907create_n_graft:
908 if (!(n->nlmsg_flags&NLM_F_CREATE))
909 return -ENOENT;
910 if (clid == TC_H_INGRESS)
David S. Millerbb949fb2008-07-08 16:55:56 -0700911 q = qdisc_create(dev, &dev->rx_queue,
912 tcm->tcm_parent, tcm->tcm_parent,
Patrick McHardyffc8fef2007-07-30 17:11:50 -0700913 tca, &err);
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900914 else
David S. Millere8a04642008-07-17 00:34:19 -0700915 q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
David S. Millerbb949fb2008-07-08 16:55:56 -0700916 tcm->tcm_parent, tcm->tcm_handle,
Patrick McHardyffc8fef2007-07-30 17:11:50 -0700917 tca, &err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700918 if (q == NULL) {
919 if (err == -EAGAIN)
920 goto replay;
921 return err;
922 }
923
924graft:
925 if (1) {
926 struct Qdisc *old_q = NULL;
927 err = qdisc_graft(dev, p, clid, q, &old_q);
928 if (err) {
929 if (q) {
Patrick McHardyfd44de72007-04-16 17:07:08 -0700930 qdisc_lock_tree(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700931 qdisc_destroy(q);
Patrick McHardyfd44de72007-04-16 17:07:08 -0700932 qdisc_unlock_tree(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 }
934 return err;
935 }
936 qdisc_notify(skb, n, clid, old_q, q);
937 if (old_q) {
Patrick McHardyfd44de72007-04-16 17:07:08 -0700938 qdisc_lock_tree(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700939 qdisc_destroy(old_q);
Patrick McHardyfd44de72007-04-16 17:07:08 -0700940 qdisc_unlock_tree(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700941 }
942 }
943 return 0;
944}
945
/* Fill a netlink message with the description of one qdisc: a tcmsg
 * header, the TCA_KIND string, the qdisc's own dump() attributes and
 * its statistics blocks.  Returns skb->len on success, or -1 if the
 * skb ran out of tailroom (any partially written data is trimmed
 * back off via nlmsg_trim()).
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Remember the tail so a failed fill can be undone. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the qdisc refcount in qdisc dumps. */
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Refresh the cached queue length before copying the stats. */
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	/* Drop everything written since entry; caller sees -1. */
	nlmsg_trim(skb, b);
	return -1;
}
991
992static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
993 u32 clid, struct Qdisc *old, struct Qdisc *new)
994{
995 struct sk_buff *skb;
996 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
997
998 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
999 if (!skb)
1000 return -ENOBUFS;
1001
1002 if (old && old->handle) {
1003 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1004 goto err_out;
1005 }
1006 if (new) {
1007 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1008 goto err_out;
1009 }
1010
1011 if (skb->len)
Denis V. Lunev97c53ca2007-11-19 22:26:51 -08001012 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001013
1014err_out:
1015 kfree_skb(skb);
1016 return -EINVAL;
1017}
1018
/* RTM_GETQDISC dump handler: walk every netdev in init_net and every
 * qdisc on its (single) tx queue, emitting one RTM_NEWQDISC message
 * per qdisc.  The resume position across dump calls is kept in
 * cb->args[0] (device index) and cb->args[1] (qdisc index within the
 * device).
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	/* Packet scheduling is not namespace-aware here; only init_net. */
	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;
		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			/* New device: restart its qdisc list from 0. */
			s_q_idx = 0;
		q_idx = 0;
		dev_queue = netdev_get_tx_queue(dev, 0);
		list_for_each_entry(q, &dev_queue->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			/* skb full: stop and save position for the next call. */
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
				goto done;
			q_idx++;
		}
cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1064
1065
1066
1067/************************************************
1068 * Traffic classes manipulation. *
1069 ************************************************/
1070
1071
1072
/* RTM_{NEW,DEL,GET}TCLASS handler: resolve the qdisc that owns the
 * class from the (possibly partial) tcm_parent/tcm_handle pair, then
 * dispatch to the qdisc's class ops (get/delete/change) and notify
 * listeners of the result.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	/* Packet scheduling is not namespace-aware here; only init_net. */
	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			/* Neither side names a qdisc: default to the root. */
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that the qdisc supports classes at all. */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		/* Unknown class: only RTM_NEWTCLASS with NLM_F_CREATE
		 * may proceed (to create it via cops->change below). */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create a new class or change an existing one. */
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	/* Drop the reference taken by cops->get() above. */
	if (cl)
		cops->put(q, cl);

	return err;
}
1194
1195
/* Fill a netlink message describing one traffic class of qdisc @q:
 * tcmsg header, TCA_KIND, the class ops' own dump() attributes and
 * statistics.  Returns skb->len on success, or -1 with the skb
 * trimmed back to its state on entry.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Remember the tail so a failed fill can be undone. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	/* cl_ops->dump also fixes up tcm_parent/tcm_handle for the class. */
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1235
1236static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1237 struct Qdisc *q, unsigned long cl, int event)
1238{
1239 struct sk_buff *skb;
1240 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1241
1242 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1243 if (!skb)
1244 return -ENOBUFS;
1245
1246 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1247 kfree_skb(skb);
1248 return -EINVAL;
1249 }
1250
Denis V. Lunev97c53ca2007-11-19 22:26:51 -08001251 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001252}
1253
/* Cookie passed through the class walker while dumping all classes of
 * a qdisc: extends struct qdisc_walker with the destination skb and
 * the netlink dump callback so qdisc_class_dump() can fill replies.
 */
struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};
1260
1261static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1262{
1263 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1264
1265 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1266 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1267}
1268
/* RTM_GETTCLASS dump handler: walk the qdiscs on the device named by
 * tcm_ifindex and, for each classful qdisc (optionally filtered by
 * tcm_parent's major), walk its classes via cl_ops->walk().  Resume
 * state lives in cb->args[0] (qdisc index) and cb->args[1] (class
 * walker skip count).
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	/* Packet scheduling is not namespace-aware here; only init_net. */
	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	list_for_each_entry(q, &dev_queue->qdisc_list, list) {
		/* Skip already-dumped qdiscs, classless qdiscs, and
		 * qdiscs filtered out by the requested parent major. */
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			/* New qdisc: clear the per-qdisc walker state. */
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}

	cb->args[0] = t;

	/* Matches the reference taken by dev_get_by_index() above. */
	dev_put(dev);
	return skb->len;
}
1319
1320/* Main classifier routine: scans classifier chain attached
1321 to this qdisc, (optionally) tests for protocol and asks
1322 specific classifiers.
1323 */
Patrick McHardy73ca4912007-07-15 00:02:31 -07001324int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1325 struct tcf_result *res)
1326{
1327 __be16 protocol = skb->protocol;
1328 int err = 0;
1329
1330 for (; tp; tp = tp->next) {
1331 if ((tp->protocol == protocol ||
1332 tp->protocol == htons(ETH_P_ALL)) &&
1333 (err = tp->classify(skb, tp, res)) >= 0) {
1334#ifdef CONFIG_NET_CLS_ACT
1335 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1336 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1337#endif
1338 return err;
1339 }
1340 }
1341 return -1;
1342}
1343EXPORT_SYMBOL(tc_classify_compat);
1344
/* Main classifier entry point: like tc_classify_compat(), but with
 * CONFIG_NET_CLS_ACT it honours TC_ACT_RECLASSIFY verdicts by
 * restarting the walk from the head of the chain, bounded by
 * MAX_REC_LOOP to break reclassification loops (the packet is shot
 * when the bound is exceeded).
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	/* Remember the chain head so RECLASSIFY can restart from it. */
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		/* Loop counter is kept in the skb's tc_verd field. */
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375
Patrick McHardya48b5a62007-03-23 11:29:43 -07001376void tcf_destroy(struct tcf_proto *tp)
1377{
1378 tp->ops->destroy(tp);
1379 module_put(tp->ops->owner);
1380 kfree(tp);
1381}
1382
Patrick McHardyff31ab52008-07-01 19:52:38 -07001383void tcf_destroy_chain(struct tcf_proto **fl)
Patrick McHardya48b5a62007-03-23 11:29:43 -07001384{
1385 struct tcf_proto *tp;
1386
Patrick McHardyff31ab52008-07-01 19:52:38 -07001387 while ((tp = *fl) != NULL) {
1388 *fl = tp->next;
Patrick McHardya48b5a62007-03-23 11:29:43 -07001389 tcf_destroy(tp);
1390 }
1391}
1392EXPORT_SYMBOL(tcf_destroy_chain);
1393
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394#ifdef CONFIG_PROC_FS
/* seq_file show hook for /proc/net/psched: prints four hex words —
 * nanoseconds per psched tick, PSCHED_US2NS(1), a constant 1000000,
 * and the hrtimer (CLOCK_MONOTONIC) resolution in HZ — which
 * userspace tc uses to convert between its time units and kernel
 * ticks.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
1407
/* open() hook for /proc/net/psched: single-shot seq_file. */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}
1412
/* File operations for the read-only /proc/net/psched entry. */
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420#endif
1421
/* Subsystem init: register the built-in fifo qdiscs, create
 * /proc/net/psched, and hook the qdisc/class rtnetlink message types.
 * NOTE(review): the return values of register_qdisc() and
 * proc_net_fops_create() are ignored here — presumably they cannot
 * fail this early in boot; confirm before relying on it.
 */
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}
1437
1438subsys_initcall(pktsched_init);