1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/proc_fs.h>
33 #include <linux/seq_file.h>
34 #include <linux/kmod.h>
35 #include <linux/list.h>
36 #include <linux/bitops.h>
37 
38 #include <net/sock.h>
39 #include <net/pkt_sched.h>
40 
41 #include <asm/processor.h>
42 #include <asm/uaccess.h>
43 #include <asm/system.h>
44 
45 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
46 			struct Qdisc *old, struct Qdisc *new);
47 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
48 			 struct Qdisc *q, unsigned long cl, int event);
49 
50 /*
51 
52    Short review.
53    -------------
54 
55    This file consists of two interrelated parts:
56 
57    1. queueing disciplines manager frontend.
58    2. traffic classes manager frontend.
59 
60    Generally, a queueing discipline ("qdisc") is a black box
61    which is able to enqueue packets and to dequeue them (when
62    the device is ready to send something) in an order and at times
63    determined by the algorithm hidden inside it.
64 
65    qdiscs fall into two categories:
66    - "queues", which have no internal structure visible from outside.
67    - "schedulers", which split all the packets into "traffic classes",
68      using "packet classifiers" (see cls_api.c).
69 
70    In turn, classes may have child qdiscs (as a rule, queues)
71    attached to them, and so on.
72 
73    The goal of the routines in this file is to translate
74    the information supplied by the user in the form of handles
75    into a form more intelligible to the kernel, to perform some
76    sanity checks and the part of the work that is common to all
77    qdiscs, and to provide rtnetlink notifications.
78 
79    All of the real intelligent work is done inside the qdisc modules.
80 
81 
82 
83    Every discipline has two major routines: enqueue and dequeue.
84 
85    ---dequeue
86 
87    dequeue usually returns an skb to send. It is allowed to return NULL,
88    but that does not mean the queue is empty; it just means that the
89    discipline does not want to send anything this time.
90    The queue is really empty only if q->q.qlen == 0.
91    For complicated disciplines with multiple queues, q->q is not the
92    real packet queue, but q->q.qlen must nevertheless be valid.
93 
94    ---enqueue
95 
96    enqueue returns 0 if the packet was enqueued successfully.
97    If a packet (this one or another one) was dropped, it returns
98    a non-zero error code:
99    NET_XMIT_DROP 	- this packet was dropped.
100      Expected action: do not back off, but wait until the queue clears.
101    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
102      Expected action: back off or ignore.
103    NET_XMIT_POLICED	- dropped by the policer.
104      Expected action: back off or report an error to real-time apps.
105 
106    Auxiliary routines:
107 
108    ---requeue
109 
110    requeues a packet that was already dequeued. It is used for non-standard
111    or just buggy devices, which can defer output even if dev->tbusy=0.
112 
113    ---reset
114 
115    returns the qdisc to its initial state: purges all buffers, clears all
116    timers and counters (except for statistics), etc.
117 
118    ---init
119 
120    initializes a newly created qdisc.
121 
122    ---destroy
123 
124    destroys the resources allocated by init and during the lifetime of the qdisc.
125 
126    ---change
127 
128    changes qdisc parameters.
129  */
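/*
   For illustration, a minimal FIFO-style "queue" honouring the contract
   above might look like the sketch below (the example_* names are
   hypothetical, not part of this API):

	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
	{
		__skb_queue_tail(&sch->q, skb);		// keeps sch->q.qlen valid
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	static struct sk_buff *example_dequeue(struct Qdisc *sch)
	{
		return __skb_dequeue(&sch->q);		// NULL: nothing to send now
	}
 */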
130 
131 /* Protects the list of registered TC modules. It is a pure SMP lock. */
132 static DEFINE_RWLOCK(qdisc_mod_lock);
133 
134 
135 /************************************************
136  *	Queueing disciplines manipulation.	*
137  ************************************************/
138 
139 
140 /* The list of all installed queueing disciplines. */
141 
142 static struct Qdisc_ops *qdisc_base;
143 
144 /* Register/unregister a queueing discipline */
145 
146 int register_qdisc(struct Qdisc_ops *qops)
147 {
148 	struct Qdisc_ops *q, **qp;
149 	int rc = -EEXIST;
150 
151 	write_lock(&qdisc_mod_lock);
152 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
153 		if (!strcmp(qops->id, q->id))
154 			goto out;
155 
156 	if (qops->enqueue == NULL)
157 		qops->enqueue = noop_qdisc_ops.enqueue;
158 	if (qops->requeue == NULL)
159 		qops->requeue = noop_qdisc_ops.requeue;
160 	if (qops->dequeue == NULL)
161 		qops->dequeue = noop_qdisc_ops.dequeue;
162 
163 	qops->next = NULL;
164 	*qp = qops;
165 	rc = 0;
166 out:
167 	write_unlock(&qdisc_mod_lock);
168 	return rc;
169 }
170 
171 int unregister_qdisc(struct Qdisc_ops *qops)
172 {
173 	struct Qdisc_ops *q, **qp;
174 	int err = -ENOENT;
175 
176 	write_lock(&qdisc_mod_lock);
177 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
178 		if (q == qops)
179 			break;
180 	if (q) {
181 		*qp = q->next;
182 		q->next = NULL;
183 		err = 0;
184 	}
185 	write_unlock(&qdisc_mod_lock);
186 	return err;
187 }
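/*
   A usage sketch: a scheduler module typically registers its Qdisc_ops on
   load and unregisters them on unload.  Note that register_qdisc() above
   fills any missing enqueue/requeue/dequeue callbacks with the noop
   defaults.  The example_* names are hypothetical:

	static struct Qdisc_ops example_qdisc_ops = {
		.id		= "example",
		.priv_size	= 0,
		.enqueue	= example_enqueue,
		.dequeue	= example_dequeue,
		.owner		= THIS_MODULE,
	};

	static int __init example_module_init(void)
	{
		return register_qdisc(&example_qdisc_ops);
	}

	static void __exit example_module_exit(void)
	{
		unregister_qdisc(&example_qdisc_ops);
	}
	module_init(example_module_init);
	module_exit(example_module_exit);
 */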
188 
189 /* We know the handle. Find the qdisc among all qdiscs attached to the device
190    (the root qdisc, all its children, children of children, etc.)
191  */
192 
193 static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
194 {
195 	struct Qdisc *q;
196 
197 	list_for_each_entry(q, &dev->qdisc_list, list) {
198 		if (q->handle == handle)
199 			return q;
200 	}
201 	return NULL;
202 }
203 
204 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
205 {
206 	struct Qdisc *q;
207 
208 	read_lock(&qdisc_tree_lock);
209 	q = __qdisc_lookup(dev, handle);
210 	read_unlock(&qdisc_tree_lock);
211 	return q;
212 }
213 
214 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
215 {
216 	unsigned long cl;
217 	struct Qdisc *leaf;
218 	struct Qdisc_class_ops *cops = p->ops->cl_ops;
219 
220 	if (cops == NULL)
221 		return NULL;
222 	cl = cops->get(p, classid);
223 
224 	if (cl == 0)
225 		return NULL;
226 	leaf = cops->leaf(p, cl);
227 	cops->put(p, cl);
228 	return leaf;
229 }
230 
231 /* Find a queueing discipline by name */
232 
233 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
234 {
235 	struct Qdisc_ops *q = NULL;
236 
237 	if (kind) {
238 		read_lock(&qdisc_mod_lock);
239 		for (q = qdisc_base; q; q = q->next) {
240 			if (rtattr_strcmp(kind, q->id) == 0) {
241 				if (!try_module_get(q->owner))
242 					q = NULL;
243 				break;
244 			}
245 		}
246 		read_unlock(&qdisc_mod_lock);
247 	}
248 	return q;
249 }
250 
251 static struct qdisc_rate_table *qdisc_rtab_list;
252 
253 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
254 {
255 	struct qdisc_rate_table *rtab;
256 
257 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
258 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
259 			rtab->refcnt++;
260 			return rtab;
261 		}
262 	}
263 
264 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
265 		return NULL;
266 
267 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
268 	if (rtab) {
269 		rtab->rate = *r;
270 		rtab->refcnt = 1;
271 		memcpy(rtab->data, RTA_DATA(tab), 1024);
272 		rtab->next = qdisc_rtab_list;
273 		qdisc_rtab_list = rtab;
274 	}
275 	return rtab;
276 }
277 
278 void qdisc_put_rtab(struct qdisc_rate_table *tab)
279 {
280 	struct qdisc_rate_table *rtab, **rtabp;
281 
282 	if (!tab || --tab->refcnt)
283 		return;
284 
285 	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
286 		if (rtab == tab) {
287 			*rtabp = rtab->next;
288 			kfree(rtab);
289 			return;
290 		}
291 	}
292 }
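/*
   A usage sketch in the style of sch_tbf: a shaping qdisc resolves its
   rate table once in ->init()/->change() and releases it in ->destroy().
   TCA_EXAMPLE_RTAB is a hypothetical attribute; note that the table
   payload must be exactly 1024 bytes (256 u32 slots), as checked above:

	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB-1]);
	if (rtab == NULL)
		return -EINVAL;
	...
	qdisc_put_rtab(rtab);		// later, from ->destroy()
 */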
293 
294 
295 /* Allocate a unique handle from the space managed by the kernel */
296 
297 static u32 qdisc_alloc_handle(struct net_device *dev)
298 {
299 	int i = 0x10000;
300 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
301 
302 	do {
303 		autohandle += TC_H_MAKE(0x10000U, 0);
304 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
305 			autohandle = TC_H_MAKE(0x80000000U, 0);
306 	} while (qdisc_lookup(dev, autohandle) && --i > 0);
307 
308 	return i > 0 ? autohandle : 0;
309 }
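/*
   For reference, a handle is a 32-bit value whose upper 16 bits are the
   qdisc "major" number and whose lower 16 bits are the class "minor"
   number (see linux/pkt_sched.h), e.g.:

	TC_H_MAKE(0x10000, 0) == 0x00010000	// printed by tc as "1:"
	TC_H_MAJ(0x00010010)  == 0x00010000
	TC_H_MIN(0x00010010)  == 0x10		// class "1:10" (IDs are hex)

   The allocator above hands out majors from 0x8001 upward, wrapping
   around and skipping TC_H_ROOT.
 */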
310 
311 /* Attach toplevel qdisc to device dev */
312 
313 static struct Qdisc *
314 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
315 {
316 	struct Qdisc *oqdisc;
317 
318 	if (dev->flags & IFF_UP)
319 		dev_deactivate(dev);
320 
321 	qdisc_lock_tree(dev);
322 	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
323 		oqdisc = dev->qdisc_ingress;
324 		/* Prune old scheduler */
325 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
326 			/* delete */
327 			qdisc_reset(oqdisc);
328 			dev->qdisc_ingress = NULL;
329 		} else {  /* new */
330 			dev->qdisc_ingress = qdisc;
331 		}
332 
333 	} else {
334 
335 		oqdisc = dev->qdisc_sleeping;
336 
337 		/* Prune old scheduler */
338 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
339 			qdisc_reset(oqdisc);
340 
341 		/* ... and graft new one */
342 		if (qdisc == NULL)
343 			qdisc = &noop_qdisc;
344 		dev->qdisc_sleeping = qdisc;
345 		dev->qdisc = &noop_qdisc;
346 	}
347 
348 	qdisc_unlock_tree(dev);
349 
350 	if (dev->flags & IFF_UP)
351 		dev_activate(dev);
352 
353 	return oqdisc;
354 }
355 
356 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
357 {
358 	struct Qdisc_class_ops *cops;
359 	unsigned long cl;
360 	u32 parentid;
361 
362 	if (n == 0)
363 		return;
364 	while ((parentid = sch->parent)) {
365 		sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
366 		cops = sch->ops->cl_ops;
367 		if (cops->qlen_notify) {
368 			cl = cops->get(sch, parentid);
369 			cops->qlen_notify(sch, cl);
370 			cops->put(sch, cl);
371 		}
372 		sch->q.qlen -= n;
373 	}
374 }
375 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
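/*
 * A usage sketch: a qdisc that drops packets outside the normal dequeue
 * path fixes its own qlen and then propagates the change to its
 * ancestors, e.g. (example_purge is a hypothetical helper):
 *
 *	unsigned int dropped = example_purge(sch);
 *
 *	sch->q.qlen -= dropped;
 *	qdisc_tree_decrease_qlen(sch, dropped);
 */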
376 
377 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
378    to device "dev".
379 
380    The old qdisc is not destroyed; it is returned in *old.
381  */
382 
383 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
384 		       u32 classid,
385 		       struct Qdisc *new, struct Qdisc **old)
386 {
387 	int err = 0;
388 	struct Qdisc *q = *old;
389 
390 
391 	if (parent == NULL) {
392 		if (q && q->flags&TCQ_F_INGRESS) {
393 			*old = dev_graft_qdisc(dev, q);
394 		} else {
395 			*old = dev_graft_qdisc(dev, new);
396 		}
397 	} else {
398 		struct Qdisc_class_ops *cops = parent->ops->cl_ops;
399 
400 		err = -EINVAL;
401 
402 		if (cops) {
403 			unsigned long cl = cops->get(parent, classid);
404 			if (cl) {
405 				err = cops->graft(parent, cl, new, old);
406 				if (new)
407 					new->parent = classid;
408 				cops->put(parent, cl);
409 			}
410 		}
411 	}
412 	return err;
413 }
414 
415 /*
416    Allocate and initialize a new qdisc.
417 
418    Parameters are passed via the tca attribute array.
419  */
420 
421 static struct Qdisc *
422 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
423 {
424 	int err;
425 	struct rtattr *kind = tca[TCA_KIND-1];
426 	struct Qdisc *sch;
427 	struct Qdisc_ops *ops;
428 
429 	ops = qdisc_lookup_ops(kind);
430 #ifdef CONFIG_KMOD
431 	if (ops == NULL && kind != NULL) {
432 		char name[IFNAMSIZ];
433 		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
434 			/* We dropped the RTNL semaphore in order to
435 			 * perform the module load.  So, even if we
436 			 * succeeded in loading the module we have to
437 			 * tell the caller to replay the request.  We
438 			 * indicate this using -EAGAIN.
439 			 * We replay the request because the device may
440 			 * go away in the meantime.
441 			 */
442 			rtnl_unlock();
443 			request_module("sch_%s", name);
444 			rtnl_lock();
445 			ops = qdisc_lookup_ops(kind);
446 			if (ops != NULL) {
447 			/* We will try qdisc_lookup_ops again on replay,
448 			 * so don't keep a reference here.
449 			 */
450 				module_put(ops->owner);
451 				err = -EAGAIN;
452 				goto err_out;
453 			}
454 		}
455 	}
456 #endif
457 
458 	err = -ENOENT;
459 	if (ops == NULL)
460 		goto err_out;
461 
462 	sch = qdisc_alloc(dev, ops);
463 	if (IS_ERR(sch)) {
464 		err = PTR_ERR(sch);
465 		goto err_out2;
466 	}
467 
468 	if (handle == TC_H_INGRESS) {
469 		sch->flags |= TCQ_F_INGRESS;
470 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
471 	} else if (handle == 0) {
472 		handle = qdisc_alloc_handle(dev);
473 		err = -ENOMEM;
474 		if (handle == 0)
475 			goto err_out3;
476 	}
477 
478 	sch->handle = handle;
479 
480 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
481 #ifdef CONFIG_NET_ESTIMATOR
482 		if (tca[TCA_RATE-1]) {
483 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
484 						sch->stats_lock,
485 						tca[TCA_RATE-1]);
486 			if (err) {
487 				/*
488 				 * Any broken qdiscs that would require
489 				 * an ops->reset() here? The qdisc was never
490 				 * in action so it shouldn't be necessary.
491 				 */
492 				if (ops->destroy)
493 					ops->destroy(sch);
494 				goto err_out3;
495 			}
496 		}
497 #endif
498 		qdisc_lock_tree(dev);
499 		list_add_tail(&sch->list, &dev->qdisc_list);
500 		qdisc_unlock_tree(dev);
501 
502 		return sch;
503 	}
504 err_out3:
505 	dev_put(dev);
506 	kfree((char *) sch - sch->padded);
507 err_out2:
508 	module_put(ops->owner);
509 err_out:
510 	*errp = err;
511 	return NULL;
512 }
513 
514 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
515 {
516 	if (tca[TCA_OPTIONS-1]) {
517 		int err;
518 
519 		if (sch->ops->change == NULL)
520 			return -EINVAL;
521 		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
522 		if (err)
523 			return err;
524 	}
525 #ifdef CONFIG_NET_ESTIMATOR
526 	if (tca[TCA_RATE-1])
527 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
528 			sch->stats_lock, tca[TCA_RATE-1]);
529 #endif
530 	return 0;
531 }
532 
533 struct check_loop_arg
534 {
535 	struct qdisc_walker 	w;
536 	struct Qdisc		*p;
537 	int			depth;
538 };
539 
540 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
541 
542 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
543 {
544 	struct check_loop_arg	arg;
545 
546 	if (q->ops->cl_ops == NULL)
547 		return 0;
548 
549 	arg.w.stop = arg.w.skip = arg.w.count = 0;
550 	arg.w.fn = check_loop_fn;
551 	arg.depth = depth;
552 	arg.p = p;
553 	q->ops->cl_ops->walk(q, &arg.w);
554 	return arg.w.stop ? -ELOOP : 0;
555 }
556 
557 static int
558 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
559 {
560 	struct Qdisc *leaf;
561 	struct Qdisc_class_ops *cops = q->ops->cl_ops;
562 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
563 
564 	leaf = cops->leaf(q, cl);
565 	if (leaf) {
566 		if (leaf == arg->p || arg->depth > 7)
567 			return -ELOOP;
568 		return check_loop(leaf, arg->p, arg->depth + 1);
569 	}
570 	return 0;
571 }
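/*
 * For example, the walk above is what makes tc_modify_qdisc() below fail
 * with -ELOOP when a request tries to graft an existing qdisc underneath
 * a class living in that qdisc's own subtree, instead of silently
 * creating a cycle in the hierarchy.
 */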
572 
573 /*
574  * Delete/get qdisc.
575  */
576 
577 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
578 {
579 	struct tcmsg *tcm = NLMSG_DATA(n);
580 	struct rtattr **tca = arg;
581 	struct net_device *dev;
582 	u32 clid = tcm->tcm_parent;
583 	struct Qdisc *q = NULL;
584 	struct Qdisc *p = NULL;
585 	int err;
586 
587 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
588 		return -ENODEV;
589 
590 	if (clid) {
591 		if (clid != TC_H_ROOT) {
592 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
593 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
594 					return -ENOENT;
595 				q = qdisc_leaf(p, clid);
596 			} else { /* ingress */
597 				q = dev->qdisc_ingress;
598 			}
599 		} else {
600 			q = dev->qdisc_sleeping;
601 		}
602 		if (!q)
603 			return -ENOENT;
604 
605 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
606 			return -EINVAL;
607 	} else {
608 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
609 			return -ENOENT;
610 	}
611 
612 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
613 		return -EINVAL;
614 
615 	if (n->nlmsg_type == RTM_DELQDISC) {
616 		if (!clid)
617 			return -EINVAL;
618 		if (q->handle == 0)
619 			return -ENOENT;
620 		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
621 			return err;
622 		if (q) {
623 			qdisc_notify(skb, n, clid, q, NULL);
624 			spin_lock_bh(&dev->queue_lock);
625 			qdisc_destroy(q);
626 			spin_unlock_bh(&dev->queue_lock);
627 		}
628 	} else {
629 		qdisc_notify(skb, n, clid, NULL, q);
630 	}
631 	return 0;
632 }
633 
634 /*
635    Create/change qdisc.
636  */
637 
638 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
639 {
640 	struct tcmsg *tcm;
641 	struct rtattr **tca;
642 	struct net_device *dev;
643 	u32 clid;
644 	struct Qdisc *q, *p;
645 	int err;
646 
647 replay:
648 	/* Reinit, just in case something touches this. */
649 	tcm = NLMSG_DATA(n);
650 	tca = arg;
651 	clid = tcm->tcm_parent;
652 	q = p = NULL;
653 
654 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
655 		return -ENODEV;
656 
657 	if (clid) {
658 		if (clid != TC_H_ROOT) {
659 			if (clid != TC_H_INGRESS) {
660 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
661 					return -ENOENT;
662 				q = qdisc_leaf(p, clid);
663 			} else { /* ingress */
664 				q = dev->qdisc_ingress;
665 			}
666 		} else {
667 			q = dev->qdisc_sleeping;
668 		}
669 
670 		/* It may be the default qdisc; ignore it */
671 		if (q && q->handle == 0)
672 			q = NULL;
673 
674 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
675 			if (tcm->tcm_handle) {
676 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
677 					return -EEXIST;
678 				if (TC_H_MIN(tcm->tcm_handle))
679 					return -EINVAL;
680 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
681 					goto create_n_graft;
682 				if (n->nlmsg_flags&NLM_F_EXCL)
683 					return -EEXIST;
684 				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
685 					return -EINVAL;
686 				if (q == p ||
687 				    (p && check_loop(q, p, 0)))
688 					return -ELOOP;
689 				atomic_inc(&q->refcnt);
690 				goto graft;
691 			} else {
692 				if (q == NULL)
693 					goto create_n_graft;
694 
695 				/* This magic test requires explanation.
696 				 *
697 				 *   We know that some child q is already
698 				 *   attached to this parent and we have a choice:
699 				 *   either to change it or to create/graft a new one.
700 				 *
701 				 *   1. We are allowed to create/graft only
702 				 *   if both the CREATE and REPLACE flags are set.
703 				 *
704 				 *   2. If EXCL is set, the requestor meant that
705 				 *   the qdisc tcm_handle is not expected
706 				 *   to exist, so we choose create/graft too.
707 				 *
708 				 *   3. The last case is when no flags are set.
709 				 *   Alas, this is a sort of hole in the API; we
710 				 *   cannot decide what to do unambiguously.
711 				 *   For now we select create/graft if the
712 				 *   user gave a KIND which does not match the existing one.
713 				 */
714 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
715 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
716 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
717 				     (tca[TCA_KIND-1] &&
718 				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
719 					goto create_n_graft;
720 			}
721 		}
722 	} else {
723 		if (!tcm->tcm_handle)
724 			return -EINVAL;
725 		q = qdisc_lookup(dev, tcm->tcm_handle);
726 	}
727 
728 	/* Change qdisc parameters */
729 	if (q == NULL)
730 		return -ENOENT;
731 	if (n->nlmsg_flags&NLM_F_EXCL)
732 		return -EEXIST;
733 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
734 		return -EINVAL;
735 	err = qdisc_change(q, tca);
736 	if (err == 0)
737 		qdisc_notify(skb, n, clid, NULL, q);
738 	return err;
739 
740 create_n_graft:
741 	if (!(n->nlmsg_flags&NLM_F_CREATE))
742 		return -ENOENT;
743 	if (clid == TC_H_INGRESS)
744 		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
745 	else
746 		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
747 	if (q == NULL) {
748 		if (err == -EAGAIN)
749 			goto replay;
750 		return err;
751 	}
752 
753 graft:
754 	if (1) {
755 		struct Qdisc *old_q = NULL;
756 		err = qdisc_graft(dev, p, clid, q, &old_q);
757 		if (err) {
758 			if (q) {
759 				spin_lock_bh(&dev->queue_lock);
760 				qdisc_destroy(q);
761 				spin_unlock_bh(&dev->queue_lock);
762 			}
763 			return err;
764 		}
765 		qdisc_notify(skb, n, clid, old_q, q);
766 		if (old_q) {
767 			spin_lock_bh(&dev->queue_lock);
768 			qdisc_destroy(old_q);
769 			spin_unlock_bh(&dev->queue_lock);
770 		}
771 	}
772 	return 0;
773 }
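/*
 * For example, "tc qdisc add dev eth0 root handle 1: prio" arrives here
 * as an RTM_NEWQDISC message with NLM_F_CREATE|NLM_F_EXCL set,
 * tcm_parent == TC_H_ROOT, tcm_handle == 0x00010000 and TCA_KIND ==
 * "prio"; the function above then takes the create_n_graft path, builds
 * the qdisc with qdisc_create() and attaches it with qdisc_graft().
 */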
774 
775 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
776 			 u32 pid, u32 seq, u16 flags, int event)
777 {
778 	struct tcmsg *tcm;
779 	struct nlmsghdr  *nlh;
780 	unsigned char	 *b = skb->tail;
781 	struct gnet_dump d;
782 
783 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
784 	tcm = NLMSG_DATA(nlh);
785 	tcm->tcm_family = AF_UNSPEC;
786 	tcm->tcm__pad1 = 0;
787 	tcm->tcm__pad2 = 0;
788 	tcm->tcm_ifindex = q->dev->ifindex;
789 	tcm->tcm_parent = clid;
790 	tcm->tcm_handle = q->handle;
791 	tcm->tcm_info = atomic_read(&q->refcnt);
792 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
793 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
794 		goto rtattr_failure;
795 	q->qstats.qlen = q->q.qlen;
796 
797 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
798 			TCA_XSTATS, q->stats_lock, &d) < 0)
799 		goto rtattr_failure;
800 
801 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
802 		goto rtattr_failure;
803 
804 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
805 #ifdef CONFIG_NET_ESTIMATOR
806 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
807 #endif
808 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
809 		goto rtattr_failure;
810 
811 	if (gnet_stats_finish_copy(&d) < 0)
812 		goto rtattr_failure;
813 
814 	nlh->nlmsg_len = skb->tail - b;
815 	return skb->len;
816 
817 nlmsg_failure:
818 rtattr_failure:
819 	skb_trim(skb, b - skb->data);
820 	return -1;
821 }
822 
823 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
824 			u32 clid, struct Qdisc *old, struct Qdisc *new)
825 {
826 	struct sk_buff *skb;
827 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
828 
829 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
830 	if (!skb)
831 		return -ENOBUFS;
832 
833 	if (old && old->handle) {
834 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
835 			goto err_out;
836 	}
837 	if (new) {
838 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
839 			goto err_out;
840 	}
841 
842 	if (skb->len)
843 		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
844 
845 err_out:
846 	kfree_skb(skb);
847 	return -EINVAL;
848 }
849 
850 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
851 {
852 	int idx, q_idx;
853 	int s_idx, s_q_idx;
854 	struct net_device *dev;
855 	struct Qdisc *q;
856 
857 	s_idx = cb->args[0];
858 	s_q_idx = q_idx = cb->args[1];
859 	read_lock(&dev_base_lock);
860 	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
861 		if (idx < s_idx)
862 			continue;
863 		if (idx > s_idx)
864 			s_q_idx = 0;
865 		read_lock(&qdisc_tree_lock);
866 		q_idx = 0;
867 		list_for_each_entry(q, &dev->qdisc_list, list) {
868 			if (q_idx < s_q_idx) {
869 				q_idx++;
870 				continue;
871 			}
872 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
873 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
874 				read_unlock(&qdisc_tree_lock);
875 				goto done;
876 			}
877 			q_idx++;
878 		}
879 		read_unlock(&qdisc_tree_lock);
880 	}
881 
882 done:
883 	read_unlock(&dev_base_lock);
884 
885 	cb->args[0] = idx;
886 	cb->args[1] = q_idx;
887 
888 	return skb->len;
889 }
890 
891 
892 
893 /************************************************
894  *	Traffic classes manipulation.		*
895  ************************************************/
896 
897 
898 
899 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
900 {
901 	struct tcmsg *tcm = NLMSG_DATA(n);
902 	struct rtattr **tca = arg;
903 	struct net_device *dev;
904 	struct Qdisc *q = NULL;
905 	struct Qdisc_class_ops *cops;
906 	unsigned long cl = 0;
907 	unsigned long new_cl;
908 	u32 pid = tcm->tcm_parent;
909 	u32 clid = tcm->tcm_handle;
910 	u32 qid = TC_H_MAJ(clid);
911 	int err;
912 
913 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
914 		return -ENODEV;
915 
916 	/*
917 	   parent == TC_H_UNSPEC - unspecified parent.
918 	   parent == TC_H_ROOT   - class is root, which has no parent.
919 	   parent == X:0	 - parent is root class.
920 	   parent == X:Y	 - parent is a node in hierarchy.
921 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
922 
923 	   handle == 0:0	 - generate handle from kernel pool.
924 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
925 	   handle == X:Y	 - the class is fully specified.
926 	   handle == X:0	 - root class.
927 	 */
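	/*
	 * For example, "tc class add dev eth0 parent 1: classid 1:10 ..."
	 * arrives with tcm_parent == 0x00010000 and tcm_handle ==
	 * 0x00010010 (tc parses the IDs as hex), so qid resolves to
	 * 0x00010000 and clid to 0x00010010 below.
	 */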
928 
929 	/* Step 1. Determine qdisc handle X:0 */
930 
931 	if (pid != TC_H_ROOT) {
932 		u32 qid1 = TC_H_MAJ(pid);
933 
934 		if (qid && qid1) {
935 			/* If both majors are known, they must be identical. */
936 			if (qid != qid1)
937 				return -EINVAL;
938 		} else if (qid1) {
939 			qid = qid1;
940 		} else if (qid == 0)
941 			qid = dev->qdisc_sleeping->handle;
942 
943 		/* Now qid is a genuine qdisc handle consistent
944 		   with both parent and child.
945 
946 		   TC_H_MAJ(pid) may still be unspecified; complete it now.
947 		 */
948 		if (pid)
949 			pid = TC_H_MAKE(qid, pid);
950 	} else {
951 		if (qid == 0)
952 			qid = dev->qdisc_sleeping->handle;
953 	}
954 
955 	/* OK. Locate qdisc */
956 	if ((q = qdisc_lookup(dev, qid)) == NULL)
957 		return -ENOENT;
958 
959 	/* And check that it supports classes */
960 	cops = q->ops->cl_ops;
961 	if (cops == NULL)
962 		return -EINVAL;
963 
964 	/* Now try to get class */
965 	if (clid == 0) {
966 		if (pid == TC_H_ROOT)
967 			clid = qid;
968 	} else
969 		clid = TC_H_MAKE(qid, clid);
970 
971 	if (clid)
972 		cl = cops->get(q, clid);
973 
974 	if (cl == 0) {
975 		err = -ENOENT;
976 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
977 			goto out;
978 	} else {
979 		switch (n->nlmsg_type) {
980 		case RTM_NEWTCLASS:
981 			err = -EEXIST;
982 			if (n->nlmsg_flags&NLM_F_EXCL)
983 				goto out;
984 			break;
985 		case RTM_DELTCLASS:
986 			err = cops->delete(q, cl);
987 			if (err == 0)
988 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
989 			goto out;
990 		case RTM_GETTCLASS:
991 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
992 			goto out;
993 		default:
994 			err = -EINVAL;
995 			goto out;
996 		}
997 	}
998 
999 	new_cl = cl;
1000 	err = cops->change(q, clid, pid, tca, &new_cl);
1001 	if (err == 0)
1002 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1003 
1004 out:
1005 	if (cl)
1006 		cops->put(q, cl);
1007 
1008 	return err;
1009 }
1010 
1011 
1012 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1013 			  unsigned long cl,
1014 			  u32 pid, u32 seq, u16 flags, int event)
1015 {
1016 	struct tcmsg *tcm;
1017 	struct nlmsghdr  *nlh;
1018 	unsigned char	 *b = skb->tail;
1019 	struct gnet_dump d;
1020 	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1021 
1022 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1023 	tcm = NLMSG_DATA(nlh);
1024 	tcm->tcm_family = AF_UNSPEC;
1025 	tcm->tcm_ifindex = q->dev->ifindex;
1026 	tcm->tcm_parent = q->handle;
1027 	tcm->tcm_handle = q->handle;
1028 	tcm->tcm_info = 0;
1029 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1030 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1031 		goto rtattr_failure;
1032 
1033 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1034 			TCA_XSTATS, q->stats_lock, &d) < 0)
1035 		goto rtattr_failure;
1036 
1037 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1038 		goto rtattr_failure;
1039 
1040 	if (gnet_stats_finish_copy(&d) < 0)
1041 		goto rtattr_failure;
1042 
1043 	nlh->nlmsg_len = skb->tail - b;
1044 	return skb->len;
1045 
1046 nlmsg_failure:
1047 rtattr_failure:
1048 	skb_trim(skb, b - skb->data);
1049 	return -1;
1050 }
1051 
1052 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1053 			  struct Qdisc *q, unsigned long cl, int event)
1054 {
1055 	struct sk_buff *skb;
1056 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1057 
1058 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1059 	if (!skb)
1060 		return -ENOBUFS;
1061 
1062 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1063 		kfree_skb(skb);
1064 		return -EINVAL;
1065 	}
1066 
1067 	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1068 }
1069 
1070 struct qdisc_dump_args
1071 {
1072 	struct qdisc_walker w;
1073 	struct sk_buff *skb;
1074 	struct netlink_callback *cb;
1075 };
1076 
1077 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1078 {
1079 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1080 
1081 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1082 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1083 }
1084 
1085 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1086 {
1087 	int t;
1088 	int s_t;
1089 	struct net_device *dev;
1090 	struct Qdisc *q;
1091 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1092 	struct qdisc_dump_args arg;
1093 
1094 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1095 		return 0;
1096 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1097 		return 0;
1098 
1099 	s_t = cb->args[0];
1100 	t = 0;
1101 
1102 	read_lock(&qdisc_tree_lock);
1103 	list_for_each_entry(q, &dev->qdisc_list, list) {
1104 		if (t < s_t || !q->ops->cl_ops ||
1105 		    (tcm->tcm_parent &&
1106 		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1107 			t++;
1108 			continue;
1109 		}
1110 		if (t > s_t)
1111 			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1112 		arg.w.fn = qdisc_class_dump;
1113 		arg.skb = skb;
1114 		arg.cb = cb;
1115 		arg.w.stop  = 0;
1116 		arg.w.skip = cb->args[1];
1117 		arg.w.count = 0;
1118 		q->ops->cl_ops->walk(q, &arg.w);
1119 		cb->args[1] = arg.w.count;
1120 		if (arg.w.stop)
1121 			break;
1122 		t++;
1123 	}
1124 	read_unlock(&qdisc_tree_lock);
1125 
1126 	cb->args[0] = t;
1127 
1128 	dev_put(dev);
1129 	return skb->len;
1130 }
1131 
1132 /* Main classifier routine: scans the classifier chain attached
1133    to this qdisc, (optionally) tests for the protocol, and asks the
1134    specific classifiers.
1135  */
1136 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1137 	struct tcf_result *res)
1138 {
1139 	int err = 0;
1140 	__be16 protocol = skb->protocol;
1141 #ifdef CONFIG_NET_CLS_ACT
1142 	struct tcf_proto *otp = tp;
1143 reclassify:
1144 #endif
1145 	protocol = skb->protocol;
1146 
1147 	for ( ; tp; tp = tp->next) {
1148 		if ((tp->protocol == protocol ||
1149 			tp->protocol == __constant_htons(ETH_P_ALL)) &&
1150 			(err = tp->classify(skb, tp, res)) >= 0) {
1151 #ifdef CONFIG_NET_CLS_ACT
1152 			if (err == TC_ACT_RECLASSIFY) {
1153 				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1154 				tp = otp;
1155 
1156 				if (MAX_REC_LOOP < verd++) {
1157 					printk(KERN_ERR "rule prio %d protocol %02x: reclassify is buggy, packet dropped\n",
1158 					       tp->prio & 0xffff, ntohs(tp->protocol));
1159 					return TC_ACT_SHOT;
1160 				}
1161 				skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1162 				goto reclassify;
1163 			} else {
1164 				if (skb->tc_verd)
1165 					skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1166 				return err;
1167 			}
1168 #else
1169 
1170 			return err;
1171 #endif
1172 		}
1173 
1174 	}
1175 	return -1;
1176 }
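/*
 * A usage sketch modelled on sch_prio: a classful qdisc's ->enqueue()
 * typically resolves the target class via its filter chain like this
 * (q->filter_list is the qdisc's own classifier list):
 *
 *	struct tcf_result res;
 *	int result = tc_classify(skb, q->filter_list, &res);
 *
 *	#ifdef CONFIG_NET_CLS_ACT
 *	switch (result) {
 *	case TC_ACT_STOLEN:
 *	case TC_ACT_QUEUED:
 *	case TC_ACT_SHOT:
 *		return NULL;		// consumed or dropped by an action
 *	}
 *	#endif
 *	// on a match (result >= 0), res.classid selects the child class
 */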
1177 
1178 static int psched_us_per_tick = 1;
1179 static int psched_tick_per_us = 1;
1180 
1181 #ifdef CONFIG_PROC_FS
1182 static int psched_show(struct seq_file *seq, void *v)
1183 {
1184 	seq_printf(seq, "%08x %08x %08x %08x\n",
1185 		      psched_tick_per_us, psched_us_per_tick,
1186 		      1000000, HZ);
1187 
1188 	return 0;
1189 }
1190 
1191 static int psched_open(struct inode *inode, struct file *file)
1192 {
1193 	return single_open(file, psched_show, PDE(inode)->data);
1194 }
1195 
1196 static const struct file_operations psched_fops = {
1197 	.owner = THIS_MODULE,
1198 	.open = psched_open,
1199 	.read  = seq_read,
1200 	.llseek = seq_lseek,
1201 	.release = single_release,
1202 };
1203 #endif
1204 
1205 #ifdef CONFIG_NET_SCH_CLK_CPU
1206 psched_tdiff_t psched_clock_per_hz;
1207 int psched_clock_scale;
1208 EXPORT_SYMBOL(psched_clock_per_hz);
1209 EXPORT_SYMBOL(psched_clock_scale);
1210 
1211 psched_time_t psched_time_base;
1212 cycles_t psched_time_mark;
1213 EXPORT_SYMBOL(psched_time_mark);
1214 EXPORT_SYMBOL(psched_time_base);
1215 
1216 /*
1217  * Periodically adjust psched_time_base to avoid overflow
1218  * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1219  */
1220 static void psched_tick(unsigned long);
1221 static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
1222 
1223 static void psched_tick(unsigned long dummy)
1224 {
1225 	if (sizeof(cycles_t) == sizeof(u32)) {
1226 		psched_time_t dummy_stamp;
1227 		PSCHED_GET_TIME(dummy_stamp);
1228 		psched_timer.expires = jiffies + 1*HZ;
1229 		add_timer(&psched_timer);
1230 	}
1231 }
1232 
1233 int __init psched_calibrate_clock(void)
1234 {
1235 	psched_time_t stamp, stamp1;
1236 	struct timeval tv, tv1;
1237 	psched_tdiff_t delay;
1238 	long rdelay;
1239 	unsigned long stop;
1240 
1241 	psched_tick(0);
1242 	stop = jiffies + HZ/10;
1243 	PSCHED_GET_TIME(stamp);
1244 	do_gettimeofday(&tv);
1245 	while (time_before(jiffies, stop)) {
1246 		barrier();
1247 		cpu_relax();
1248 	}
1249 	PSCHED_GET_TIME(stamp1);
1250 	do_gettimeofday(&tv1);
1251 
1252 	delay = PSCHED_TDIFF(stamp1, stamp);
1253 	rdelay = tv1.tv_usec - tv.tv_usec;
1254 	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1255 	if (rdelay > delay)
1256 		return -1;
1257 	delay /= rdelay;
1258 	psched_tick_per_us = delay;
1259 	while ((delay>>=1) != 0)
1260 		psched_clock_scale++;
1261 	psched_us_per_tick = 1<<psched_clock_scale;
1262 	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1263 	return 0;
1264 }
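/*
 * A worked example, assuming a 1 GHz TSC and HZ == 1000: the ~100 ms
 * busy-wait above yields delay of about 100,000,000 cycles against rdelay
 * of about 100,000 us, so psched_tick_per_us == 1000.  The highest set
 * bit of 1000 gives psched_clock_scale == 9, hence psched_us_per_tick ==
 * 512 and psched_clock_per_hz == (1000 * 1000) >> 9 == 1953.
 */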
1265 #endif
1266 
1267 static int __init pktsched_init(void)
1268 {
1269 	struct rtnetlink_link *link_p;
1270 
1271 #ifdef CONFIG_NET_SCH_CLK_CPU
1272 	if (psched_calibrate_clock() < 0)
1273 		return -1;
1274 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1275 	psched_tick_per_us = HZ<<PSCHED_JSCALE;
1276 	psched_us_per_tick = 1000000;
1277 #endif
1278 
1279 	link_p = rtnetlink_links[PF_UNSPEC];
1280 
1281 	/* Set up the rtnetlink links. It is done here to avoid
1282 	   exporting a large number of public symbols.
1283 	 */
1284 
1285 	if (link_p) {
1286 		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1287 		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1288 		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1289 		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1290 		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1291 		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1292 		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1293 		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1294 	}
1295 
1296 	register_qdisc(&pfifo_qdisc_ops);
1297 	register_qdisc(&bfifo_qdisc_ops);
1298 	proc_net_fops_create("psched", 0, &psched_fops);
1299 
1300 	return 0;
1301 }
1302 
1303 subsys_initcall(pktsched_init);
1304 
1305 EXPORT_SYMBOL(qdisc_get_rtab);
1306 EXPORT_SYMBOL(qdisc_put_rtab);
1307 EXPORT_SYMBOL(register_qdisc);
1308 EXPORT_SYMBOL(unregister_qdisc);
1309 EXPORT_SYMBOL(tc_classify);
1310