xref: /linux/net/sched/sch_api.c (revision 606d099cdd1080bbb50ea50dc52d98252f8f10a1)
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty; it just means that
   the discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by a policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used for non-standard or
   just buggy devices, which can defer output even when dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
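
/*
 * Illustrative sketch (not from the original source): roughly what a
 * trivial FIFO-style discipline's enqueue/dequeue pair could look like
 * against this API. The example_* names are hypothetical, and error and
 * statistics handling is reduced to the bare minimum.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) {
 *			__skb_queue_tail(&sch->q, skb);
 *			sch->bstats.bytes += skb->len;
 *			sch->bstats.packets++;
 *			return NET_XMIT_SUCCESS;
 *		}
 *		sch->qstats.drops++;
 *		kfree_skb(skb);
 *		return NET_XMIT_DROP;
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return __skb_dequeue(&sch->q);
 *	}
 */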

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister a queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
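
/*
 * Illustrative sketch (not from the original source): how a scheduler
 * module would typically use the registration API above. The
 * example_qdisc_ops structure and the example_* callbacks are
 * hypothetical.
 *
 *	static struct Qdisc_ops example_qdisc_ops = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 */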

/* We know the handle. Find the qdisc among all the qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock(&qdisc_tree_lock);
	q = __qdisc_lookup(dev, handle);
	read_unlock(&qdisc_tree_lock);
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

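/*
 * Descriptive note (added for exposition): rate tables map packet sizes
 * to transmission times for a given rate (struct tc_ratespec). Userspace
 * supplies the 256-slot table of u32 values (1024 bytes, checked below);
 * identical tables are shared and reference-counted.
 */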
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}


/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
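
/*
 * Note (added for exposition): a handle is a 32-bit value with the major
 * number in the upper 16 bits and the minor number in the lower 16 bits;
 * qdisc handles always have minor 0. The first value the loop above can
 * hand out is TC_H_MAKE(0x80010000U, 0), which tc prints as "8001:".
 */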

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}

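/*
 * Descriptive note (added for exposition): when a qdisc's queue shrinks
 * internally (e.g. packets are dropped), every ancestor's qlen must be
 * reduced to match, and classful parents that implement qlen_notify get
 * a callback so they can, for instance, deactivate a now-empty class.
 */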
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
			sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

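/*
 * Descriptive note (added for exposition): check_loop() walks the class
 * tree below q looking for p, and fails with -ELOOP if it finds it or if
 * the hierarchy is more than 7 levels deep. The qdisc create/change path
 * uses this to refuse grafts that would make a qdisc its own ancestor.
 */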
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
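
	/*
	 * Worked example (added for exposition, not in the original):
	 * "tc class add dev eth0 parent 1:1 classid 1:10 ..." arrives with
	 * tcm_parent == 0x00010001 and tcm_handle == 0x0001000a; both
	 * majors agree, so qid resolves to 0x00010000, i.e. qdisc 1:0.
	 */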

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
/* The main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol and asks the
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
			tp->protocol == __constant_htons(ETH_P_ALL)) &&
			(err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
						tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else

			return err;
#endif
		}

	}
	return -1;
}

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. This is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);