xref: /linux/net/sched/sch_api.c (revision f24e9f586b377749dff37554696cf3a105540c94)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/sched.h>
22 #include <linux/string.h>
23 #include <linux/mm.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/in.h>
27 #include <linux/errno.h>
28 #include <linux/interrupt.h>
29 #include <linux/netdevice.h>
30 #include <linux/skbuff.h>
31 #include <linux/rtnetlink.h>
32 #include <linux/init.h>
33 #include <linux/proc_fs.h>
34 #include <linux/seq_file.h>
35 #include <linux/kmod.h>
36 #include <linux/list.h>
37 #include <linux/bitops.h>
38 
39 #include <net/sock.h>
40 #include <net/pkt_sched.h>
41 
42 #include <asm/processor.h>
43 #include <asm/uaccess.h>
44 #include <asm/system.h>
45 
46 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 			struct Qdisc *old, struct Qdisc *new);
48 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 			 struct Qdisc *q, unsigned long cl, int event);
50 
51 /*
52 
53    Short review.
54    -------------
55 
56    This file consists of two interrelated parts:
57 
58    1. the queueing discipline manager frontend;
59    2. the traffic class manager frontend.
60 
61    Generally, a queueing discipline ("qdisc") is a black box
62    that is able to enqueue packets and to dequeue them (when
63    the device is ready to send something) in an order and at times
64    determined by the algorithm hidden inside it.
65 
66    qdiscs are divided into two categories:
67    - "queues", which have no internal structure visible from outside.
68    - "schedulers", which split all the packets into "traffic classes",
69      using "packet classifiers" (see cls_api.c).
70 
71    In turn, classes may have child qdiscs (as a rule, queues)
72    attached to them, and so on recursively.
73 
74    The goal of the routines in this file is to translate
75    the information supplied by the user in the form of handles
76    into a form more intelligible to the kernel, to perform the
77    sanity checks and the part of the work that is common to all
78    qdiscs, and to provide rtnetlink notifications.
79 
80    All the real intelligence lives inside the qdisc modules.
81 
82 
83 
84    Every discipline has two major routines: enqueue and dequeue.
85 
86    ---dequeue
87 
88    dequeue usually returns an skb to send. It is allowed to return NULL,
89    but that does not mean the queue is empty; it just means that the
90    discipline does not want to send anything at this time.
91    The queue is really empty only if q->q.qlen == 0.
92    For complicated disciplines with multiple queues, q->q is not the
93    real packet queue, but q->q.qlen must nevertheless be valid.
94 
95    ---enqueue
96 
97    enqueue returns 0 if the packet was enqueued successfully.
98    If a packet (this one or another one) was dropped, it returns
99    a non-zero error code:
100    NET_XMIT_DROP 	- this packet was dropped.
101      Expected action: do not back off, but wait until the queue clears.
102    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
103      Expected action: back off or ignore.
104    NET_XMIT_POLICED	- dropped by the policer.
105      Expected action: back off or report an error to real-time apps.
106 
107    Auxiliary routines:
108 
109    ---requeue
110 
111    requeues a packet that was already dequeued. It is used for
112    non-standard or simply buggy devices, which can defer output even if dev->tbusy == 0.
113 
114    ---reset
115 
116    returns the qdisc to its initial state: purges all buffers, clears all
117    timers and counters (except for statistics), etc.
118 
119    ---init
120 
121    initializes a newly created qdisc.
122 
123    ---destroy
124 
125    frees the resources allocated by init and over the lifetime of the qdisc.
126 
127    ---change
128 
129    changes qdisc parameters.
130  */
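
/*
 * To make the contract above concrete, a minimal work-conserving qdisc
 * could look roughly like this (a sketch only; the "mini_" names are
 * hypothetical and the helpers are the fifo ones from net/sch_generic.h):
 *
 *	static int mini_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
 *			return qdisc_enqueue_tail(skb, sch); // NET_XMIT_SUCCESS
 *		return qdisc_drop(skb, sch);                 // NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *mini_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch); // NULL: nothing to send now
 *	}
 */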
131 
132 /* Protects the list of registered TC modules. It is a pure SMP lock. */
133 static DEFINE_RWLOCK(qdisc_mod_lock);
134 
135 
136 /************************************************
137  *	Queueing disciplines manipulation.	*
138  ************************************************/
139 
140 
141 /* The list of all installed queueing disciplines. */
142 
143 static struct Qdisc_ops *qdisc_base;
144 
145 /* Register/unregister a queueing discipline */
146 
147 int register_qdisc(struct Qdisc_ops *qops)
148 {
149 	struct Qdisc_ops *q, **qp;
150 	int rc = -EEXIST;
151 
152 	write_lock(&qdisc_mod_lock);
153 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 		if (!strcmp(qops->id, q->id))
155 			goto out;
156 
157 	if (qops->enqueue == NULL)
158 		qops->enqueue = noop_qdisc_ops.enqueue;
159 	if (qops->requeue == NULL)
160 		qops->requeue = noop_qdisc_ops.requeue;
161 	if (qops->dequeue == NULL)
162 		qops->dequeue = noop_qdisc_ops.dequeue;
163 
164 	qops->next = NULL;
165 	*qp = qops;
166 	rc = 0;
167 out:
168 	write_unlock(&qdisc_mod_lock);
169 	return rc;
170 }
171 
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174 	struct Qdisc_ops *q, **qp;
175 	int err = -ENOENT;
176 
177 	write_lock(&qdisc_mod_lock);
178 	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 		if (q == qops)
180 			break;
181 	if (q) {
182 		*qp = q->next;
183 		q->next = NULL;
184 		err = 0;
185 	}
186 	write_unlock(&qdisc_mod_lock);
187 	return err;
188 }
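
/*
 * A qdisc module normally registers its ops on load and removes them on
 * unload, e.g. (sketch; "mini_qdisc_ops" is a hypothetical struct Qdisc_ops):
 *
 *	static int __init mini_module_init(void)
 *	{
 *		return register_qdisc(&mini_qdisc_ops);
 *	}
 *
 *	static void __exit mini_module_exit(void)
 *	{
 *		unregister_qdisc(&mini_qdisc_ops);
 *	}
 *
 *	module_init(mini_module_init)
 *	module_exit(mini_module_exit)
 */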
189 
190 /* We know the handle. Find the qdisc among all qdiscs attached to the
191    device (root qdisc, all its children, children of children, etc.)
192  */
193 
194 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
195 {
196 	struct Qdisc *q;
197 
198 	read_lock_bh(&qdisc_tree_lock);
199 	list_for_each_entry(q, &dev->qdisc_list, list) {
200 		if (q->handle == handle) {
201 			read_unlock_bh(&qdisc_tree_lock);
202 			return q;
203 		}
204 	}
205 	read_unlock_bh(&qdisc_tree_lock);
206 	return NULL;
207 }
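
/*
 * Handles are 32-bit "major:minor" pairs (see the TC_H_* macros in
 * linux/pkt_sched.h): the major half names a qdisc, the minor half a
 * class inside it, and a qdisc handle proper always has minor 0. What
 * tc prints as "1:10" is TC_H_MAKE(0x10000, 0x10) == 0x00010010, and
 * qdisc "1:" is 0x00010000.
 */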
208 
209 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
210 {
211 	unsigned long cl;
212 	struct Qdisc *leaf;
213 	struct Qdisc_class_ops *cops = p->ops->cl_ops;
214 
215 	if (cops == NULL)
216 		return NULL;
217 	cl = cops->get(p, classid);
218 
219 	if (cl == 0)
220 		return NULL;
221 	leaf = cops->leaf(p, cl);
222 	cops->put(p, cl);
223 	return leaf;
224 }
225 
226 /* Find queueing discipline by name */
227 
228 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
229 {
230 	struct Qdisc_ops *q = NULL;
231 
232 	if (kind) {
233 		read_lock(&qdisc_mod_lock);
234 		for (q = qdisc_base; q; q = q->next) {
235 			if (rtattr_strcmp(kind, q->id) == 0) {
236 				if (!try_module_get(q->owner))
237 					q = NULL;
238 				break;
239 			}
240 		}
241 		read_unlock(&qdisc_mod_lock);
242 	}
243 	return q;
244 }
245 
246 static struct qdisc_rate_table *qdisc_rtab_list;
247 
248 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
249 {
250 	struct qdisc_rate_table *rtab;
251 
252 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
253 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
254 			rtab->refcnt++;
255 			return rtab;
256 		}
257 	}
258 
259 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
260 		return NULL;
261 
262 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
263 	if (rtab) {
264 		rtab->rate = *r;
265 		rtab->refcnt = 1;
266 		memcpy(rtab->data, RTA_DATA(tab), 1024);
267 		rtab->next = qdisc_rtab_list;
268 		qdisc_rtab_list = rtab;
269 	}
270 	return rtab;
271 }
272 
273 void qdisc_put_rtab(struct qdisc_rate_table *tab)
274 {
275 	struct qdisc_rate_table *rtab, **rtabp;
276 
277 	if (!tab || --tab->refcnt)
278 		return;
279 
280 	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
281 		if (rtab == tab) {
282 			*rtabp = rtab->next;
283 			kfree(rtab);
284 			return;
285 		}
286 	}
287 }
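
/*
 * A rate table has 256 u32 slots (1024 bytes): data[len >> cell_log] is
 * the time, in scheduler ticks, needed to transmit len bytes at rate r.
 * Shapers such as TBF consult it roughly as follows (sketch;
 * "len2ticks" is a hypothetical helper, and real users must ensure the
 * slot index stays below 256):
 *
 *	static inline u32 len2ticks(struct qdisc_rate_table *rtab,
 *				    unsigned int len)
 *	{
 *		return rtab->data[len >> rtab->rate.cell_log];
 *	}
 */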
288 
289 
290 /* Allocate a unique handle from the space managed by the kernel */
291 
292 static u32 qdisc_alloc_handle(struct net_device *dev)
293 {
294 	int i = 0x10000;
295 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
296 
297 	do {
298 		autohandle += TC_H_MAKE(0x10000U, 0);
299 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
300 			autohandle = TC_H_MAKE(0x80000000U, 0);
301 	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
302 
303 	return i>0 ? autohandle : 0;
304 }
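
/*
 * Handles allocated here thus fall in the 8001:0, 8002:0, ... range;
 * we give up (and the caller reports -ENOMEM) only if 0x10000
 * consecutive majors are already taken.
 */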
305 
306 /* Attach a toplevel qdisc to device dev */
307 
308 static struct Qdisc *
309 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
310 {
311 	struct Qdisc *oqdisc;
312 
313 	if (dev->flags & IFF_UP)
314 		dev_deactivate(dev);
315 
316 	qdisc_lock_tree(dev);
317 	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
318 		oqdisc = dev->qdisc_ingress;
319 		/* Prune old scheduler */
320 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
321 			/* delete */
322 			qdisc_reset(oqdisc);
323 			dev->qdisc_ingress = NULL;
324 		} else {  /* new */
325 			dev->qdisc_ingress = qdisc;
326 		}
327 
328 	} else {
329 
330 		oqdisc = dev->qdisc_sleeping;
331 
332 		/* Prune old scheduler */
333 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
334 			qdisc_reset(oqdisc);
335 
336 		/* ... and graft new one */
337 		if (qdisc == NULL)
338 			qdisc = &noop_qdisc;
339 		dev->qdisc_sleeping = qdisc;
340 		dev->qdisc = &noop_qdisc;
341 	}
342 
343 	qdisc_unlock_tree(dev);
344 
345 	if (dev->flags & IFF_UP)
346 		dev_activate(dev);
347 
348 	return oqdisc;
349 }
350 
351 
352 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
353    to device "dev".
354 
355    The old qdisc is not destroyed but is returned in *old.
356  */
357 
358 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
359 		       u32 classid,
360 		       struct Qdisc *new, struct Qdisc **old)
361 {
362 	int err = 0;
363 	struct Qdisc *q = *old;
364 
365 
366 	if (parent == NULL) {
367 		if (q && q->flags&TCQ_F_INGRESS) {
368 			*old = dev_graft_qdisc(dev, q);
369 		} else {
370 			*old = dev_graft_qdisc(dev, new);
371 		}
372 	} else {
373 		struct Qdisc_class_ops *cops = parent->ops->cl_ops;
374 
375 		err = -EINVAL;
376 
377 		if (cops) {
378 			unsigned long cl = cops->get(parent, classid);
379 			if (cl) {
380 				err = cops->graft(parent, cl, new, old);
381 				if (new)
382 					new->parent = classid;
383 				cops->put(parent, cl);
384 			}
385 		}
386 	}
387 	return err;
388 }
389 
390 /*
391    Allocate and initialize a new qdisc.
392 
393    Parameters are passed via the netlink attribute array tca.
394  */
395 
396 static struct Qdisc *
397 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
398 {
399 	int err;
400 	struct rtattr *kind = tca[TCA_KIND-1];
401 	struct Qdisc *sch;
402 	struct Qdisc_ops *ops;
403 
404 	ops = qdisc_lookup_ops(kind);
405 #ifdef CONFIG_KMOD
406 	if (ops == NULL && kind != NULL) {
407 		char name[IFNAMSIZ];
408 		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
409 			/* We dropped the RTNL semaphore in order to
410 			 * perform the module load.  So, even if we
411 			 * succeeded in loading the module we have to
412 			 * tell the caller to replay the request.  We
413 			 * indicate this using -EAGAIN.
414 			 * We replay the request because the device may
415 			 * go away in the mean time.
416 			 */
417 			rtnl_unlock();
418 			request_module("sch_%s", name);
419 			rtnl_lock();
420 			ops = qdisc_lookup_ops(kind);
421 			if (ops != NULL) {
422 			/* We will try qdisc_lookup_ops again on replay,
423 			 * so don't keep a reference here.
424 				 */
425 				module_put(ops->owner);
426 				err = -EAGAIN;
427 				goto err_out;
428 			}
429 		}
430 	}
431 #endif
432 
433 	err = -ENOENT;
434 	if (ops == NULL)
435 		goto err_out;
436 
437 	sch = qdisc_alloc(dev, ops);
438 	if (IS_ERR(sch)) {
439 		err = PTR_ERR(sch);
440 		goto err_out2;
441 	}
442 
443 	if (handle == TC_H_INGRESS) {
444 		sch->flags |= TCQ_F_INGRESS;
445 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
446 	} else if (handle == 0) {
447 		handle = qdisc_alloc_handle(dev);
448 		err = -ENOMEM;
449 		if (handle == 0)
450 			goto err_out3;
451 	}
452 
453 	sch->handle = handle;
454 
455 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
456 #ifdef CONFIG_NET_ESTIMATOR
457 		if (tca[TCA_RATE-1]) {
458 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
459 						sch->stats_lock,
460 						tca[TCA_RATE-1]);
461 			if (err) {
462 				/*
463 				 * Any broken qdiscs that would require
464 				 * an ops->reset() here? The qdisc was never
465 				 * in action so it shouldn't be necessary.
466 				 */
467 				if (ops->destroy)
468 					ops->destroy(sch);
469 				goto err_out3;
470 			}
471 		}
472 #endif
473 		qdisc_lock_tree(dev);
474 		list_add_tail(&sch->list, &dev->qdisc_list);
475 		qdisc_unlock_tree(dev);
476 
477 		return sch;
478 	}
479 err_out3:
480 	dev_put(dev);
481 	kfree((char *) sch - sch->padded);
482 err_out2:
483 	module_put(ops->owner);
484 err_out:
485 	*errp = err;
486 	return NULL;
487 }
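
/*
 * The TCA_OPTIONS attribute handed to ops->init() above is opaque to
 * this file; each qdisc parses it itself. For a fixed-size option
 * struct, the usual pattern looks like this (sketch modelled on
 * sch_fifo; tc_fifo_qopt comes from linux/pkt_sched.h, "mini_init" is
 * hypothetical):
 *
 *	static int mini_init(struct Qdisc *sch, struct rtattr *opt)
 *	{
 *		if (opt == NULL)
 *			return 0;	// fall back to built-in defaults
 *		if (RTA_PAYLOAD(opt) < sizeof(struct tc_fifo_qopt))
 *			return -EINVAL;
 *		// ... copy parameters out of RTA_DATA(opt) ...
 *		return 0;
 *	}
 */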
488 
489 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
490 {
491 	if (tca[TCA_OPTIONS-1]) {
492 		int err;
493 
494 		if (sch->ops->change == NULL)
495 			return -EINVAL;
496 		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
497 		if (err)
498 			return err;
499 	}
500 #ifdef CONFIG_NET_ESTIMATOR
501 	if (tca[TCA_RATE-1])
502 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
503 			sch->stats_lock, tca[TCA_RATE-1]);
504 #endif
505 	return 0;
506 }
507 
508 struct check_loop_arg
509 {
510 	struct qdisc_walker 	w;
511 	struct Qdisc		*p;
512 	int			depth;
513 };
514 
515 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
516 
517 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
518 {
519 	struct check_loop_arg	arg;
520 
521 	if (q->ops->cl_ops == NULL)
522 		return 0;
523 
524 	arg.w.stop = arg.w.skip = arg.w.count = 0;
525 	arg.w.fn = check_loop_fn;
526 	arg.depth = depth;
527 	arg.p = p;
528 	q->ops->cl_ops->walk(q, &arg.w);
529 	return arg.w.stop ? -ELOOP : 0;
530 }
531 
532 static int
533 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
534 {
535 	struct Qdisc *leaf;
536 	struct Qdisc_class_ops *cops = q->ops->cl_ops;
537 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
538 
539 	leaf = cops->leaf(q, cl);
540 	if (leaf) {
541 		if (leaf == arg->p || arg->depth > 7)
542 			return -ELOOP;
543 		return check_loop(leaf, arg->p, arg->depth + 1);
544 	}
545 	return 0;
546 }
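
/*
 * check_loop() rejects grafts that would make a qdisc its own ancestor,
 * e.g. moving an existing qdisc (named by tcm_handle) under one of its
 * own descendant classes. The walk visits the prospective child's
 * subtree, up to 7 levels deep, and stops with -ELOOP if it reaches the
 * prospective parent.
 */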
547 
548 /*
549  * Delete/get qdisc.
550  */
551 
552 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
553 {
554 	struct tcmsg *tcm = NLMSG_DATA(n);
555 	struct rtattr **tca = arg;
556 	struct net_device *dev;
557 	u32 clid = tcm->tcm_parent;
558 	struct Qdisc *q = NULL;
559 	struct Qdisc *p = NULL;
560 	int err;
561 
562 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
563 		return -ENODEV;
564 
565 	if (clid) {
566 		if (clid != TC_H_ROOT) {
567 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
568 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
569 					return -ENOENT;
570 				q = qdisc_leaf(p, clid);
571 			} else { /* ingress */
572 				q = dev->qdisc_ingress;
573 			}
574 		} else {
575 			q = dev->qdisc_sleeping;
576 		}
577 		if (!q)
578 			return -ENOENT;
579 
580 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
581 			return -EINVAL;
582 	} else {
583 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
584 			return -ENOENT;
585 	}
586 
587 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
588 		return -EINVAL;
589 
590 	if (n->nlmsg_type == RTM_DELQDISC) {
591 		if (!clid)
592 			return -EINVAL;
593 		if (q->handle == 0)
594 			return -ENOENT;
595 		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
596 			return err;
597 		if (q) {
598 			qdisc_notify(skb, n, clid, q, NULL);
599 			spin_lock_bh(&dev->queue_lock);
600 			qdisc_destroy(q);
601 			spin_unlock_bh(&dev->queue_lock);
602 		}
603 	} else {
604 		qdisc_notify(skb, n, clid, NULL, q);
605 	}
606 	return 0;
607 }
608 
609 /*
610    Create/change qdisc.
611  */
612 
613 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
614 {
615 	struct tcmsg *tcm;
616 	struct rtattr **tca;
617 	struct net_device *dev;
618 	u32 clid;
619 	struct Qdisc *q, *p;
620 	int err;
621 
622 replay:
623 	/* Reinit, just in case something touches this. */
624 	tcm = NLMSG_DATA(n);
625 	tca = arg;
626 	clid = tcm->tcm_parent;
627 	q = p = NULL;
628 
629 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
630 		return -ENODEV;
631 
632 	if (clid) {
633 		if (clid != TC_H_ROOT) {
634 			if (clid != TC_H_INGRESS) {
635 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
636 					return -ENOENT;
637 				q = qdisc_leaf(p, clid);
638 			} else { /* ingress */
639 				q = dev->qdisc_ingress;
640 			}
641 		} else {
642 			q = dev->qdisc_sleeping;
643 		}
644 
645 		/* It may be the default qdisc; ignore it */
646 		if (q && q->handle == 0)
647 			q = NULL;
648 
649 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
650 			if (tcm->tcm_handle) {
651 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
652 					return -EEXIST;
653 				if (TC_H_MIN(tcm->tcm_handle))
654 					return -EINVAL;
655 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
656 					goto create_n_graft;
657 				if (n->nlmsg_flags&NLM_F_EXCL)
658 					return -EEXIST;
659 				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
660 					return -EINVAL;
661 				if (q == p ||
662 				    (p && check_loop(q, p, 0)))
663 					return -ELOOP;
664 				atomic_inc(&q->refcnt);
665 				goto graft;
666 			} else {
667 				if (q == NULL)
668 					goto create_n_graft;
669 
670 				/* This magic test requires explanation.
671 				 *
672 				 *   We know that some child q is already
673 				 *   attached to this parent and we have a choice:
674 				 *   either to change it or to create/graft a new one.
675 				 *
676 				 *   1. We are allowed to create/graft only
677 				 *   if both the CREATE and REPLACE flags are set.
678 				 *
679 				 *   2. If EXCL is set, the requestor meant that
680 				 *   the qdisc tcm_handle is not expected
681 				 *   to exist, so we choose create/graft too.
682 				 *
683 				 *   3. The last case is when no flags are set.
684 				 *   Alas, this is a sort of hole in the API; we
685 				 *   cannot decide what to do unambiguously.
686 				 *   For now we select create/graft if the
687 				 *   user gave a KIND that does not match the existing one.
688 				 */
689 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
690 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
691 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
692 				     (tca[TCA_KIND-1] &&
693 				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
694 					goto create_n_graft;
695 			}
696 		}
697 	} else {
698 		if (!tcm->tcm_handle)
699 			return -EINVAL;
700 		q = qdisc_lookup(dev, tcm->tcm_handle);
701 	}
702 
703 	/* Change qdisc parameters */
704 	if (q == NULL)
705 		return -ENOENT;
706 	if (n->nlmsg_flags&NLM_F_EXCL)
707 		return -EEXIST;
708 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
709 		return -EINVAL;
710 	err = qdisc_change(q, tca);
711 	if (err == 0)
712 		qdisc_notify(skb, n, clid, NULL, q);
713 	return err;
714 
715 create_n_graft:
716 	if (!(n->nlmsg_flags&NLM_F_CREATE))
717 		return -ENOENT;
718 	if (clid == TC_H_INGRESS)
719 		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
720 	else
721 		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
722 	if (q == NULL) {
723 		if (err == -EAGAIN)
724 			goto replay;
725 		return err;
726 	}
727 
728 graft:
729 	if (1) {
730 		struct Qdisc *old_q = NULL;
731 		err = qdisc_graft(dev, p, clid, q, &old_q);
732 		if (err) {
733 			if (q) {
734 				spin_lock_bh(&dev->queue_lock);
735 				qdisc_destroy(q);
736 				spin_unlock_bh(&dev->queue_lock);
737 			}
738 			return err;
739 		}
740 		qdisc_notify(skb, n, clid, old_q, q);
741 		if (old_q) {
742 			spin_lock_bh(&dev->queue_lock);
743 			qdisc_destroy(old_q);
744 			spin_unlock_bh(&dev->queue_lock);
745 		}
746 	}
747 	return 0;
748 }
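
/*
 * For reference, iproute2's tc maps onto this roughly as follows (an
 * assumption based on tc's usual flag usage, not verified against
 * every version):
 *
 *	tc qdisc add ...     -> RTM_NEWQDISC, NLM_F_CREATE|NLM_F_EXCL
 *	tc qdisc change ...  -> RTM_NEWQDISC, no flags
 *	tc qdisc replace ... -> RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE
 *	tc qdisc del ...     -> RTM_DELQDISC (handled in tc_get_qdisc)
 */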
749 
750 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
751 			 u32 pid, u32 seq, u16 flags, int event)
752 {
753 	struct tcmsg *tcm;
754 	struct nlmsghdr  *nlh;
755 	unsigned char	 *b = skb->tail;
756 	struct gnet_dump d;
757 
758 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
759 	tcm = NLMSG_DATA(nlh);
760 	tcm->tcm_family = AF_UNSPEC;
761 	tcm->tcm__pad1 = 0;
762 	tcm->tcm__pad2 = 0;
763 	tcm->tcm_ifindex = q->dev->ifindex;
764 	tcm->tcm_parent = clid;
765 	tcm->tcm_handle = q->handle;
766 	tcm->tcm_info = atomic_read(&q->refcnt);
767 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
768 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
769 		goto rtattr_failure;
770 	q->qstats.qlen = q->q.qlen;
771 
772 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
773 			TCA_XSTATS, q->stats_lock, &d) < 0)
774 		goto rtattr_failure;
775 
776 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
777 		goto rtattr_failure;
778 
779 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
780 #ifdef CONFIG_NET_ESTIMATOR
781 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
782 #endif
783 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
784 		goto rtattr_failure;
785 
786 	if (gnet_stats_finish_copy(&d) < 0)
787 		goto rtattr_failure;
788 
789 	nlh->nlmsg_len = skb->tail - b;
790 	return skb->len;
791 
792 nlmsg_failure:
793 rtattr_failure:
794 	skb_trim(skb, b - skb->data);
795 	return -1;
796 }
797 
798 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
799 			u32 clid, struct Qdisc *old, struct Qdisc *new)
800 {
801 	struct sk_buff *skb;
802 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
803 
804 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
805 	if (!skb)
806 		return -ENOBUFS;
807 
808 	if (old && old->handle) {
809 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
810 			goto err_out;
811 	}
812 	if (new) {
813 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
814 			goto err_out;
815 	}
816 
817 	if (skb->len)
818 		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
819 
820 err_out:
821 	kfree_skb(skb);
822 	return -EINVAL;
823 }
824 
825 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
826 {
827 	int idx, q_idx;
828 	int s_idx, s_q_idx;
829 	struct net_device *dev;
830 	struct Qdisc *q;
831 
832 	s_idx = cb->args[0];
833 	s_q_idx = q_idx = cb->args[1];
834 	read_lock(&dev_base_lock);
835 	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
836 		if (idx < s_idx)
837 			continue;
838 		if (idx > s_idx)
839 			s_q_idx = 0;
840 		read_lock_bh(&qdisc_tree_lock);
841 		q_idx = 0;
842 		list_for_each_entry(q, &dev->qdisc_list, list) {
843 			if (q_idx < s_q_idx) {
844 				q_idx++;
845 				continue;
846 			}
847 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
848 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
849 				read_unlock_bh(&qdisc_tree_lock);
850 				goto done;
851 			}
852 			q_idx++;
853 		}
854 		read_unlock_bh(&qdisc_tree_lock);
855 	}
856 
857 done:
858 	read_unlock(&dev_base_lock);
859 
860 	cb->args[0] = idx;
861 	cb->args[1] = q_idx;
862 
863 	return skb->len;
864 }
865 
866 
867 
868 /************************************************
869  *	Traffic classes manipulation.		*
870  ************************************************/
871 
872 
873 
874 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
875 {
876 	struct tcmsg *tcm = NLMSG_DATA(n);
877 	struct rtattr **tca = arg;
878 	struct net_device *dev;
879 	struct Qdisc *q = NULL;
880 	struct Qdisc_class_ops *cops;
881 	unsigned long cl = 0;
882 	unsigned long new_cl;
883 	u32 pid = tcm->tcm_parent;
884 	u32 clid = tcm->tcm_handle;
885 	u32 qid = TC_H_MAJ(clid);
886 	int err;
887 
888 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
889 		return -ENODEV;
890 
891 	/*
892 	   parent == TC_H_UNSPEC - unspecified parent.
893 	   parent == TC_H_ROOT   - class is root, which has no parent.
894 	   parent == X:0	 - parent is root class.
895 	   parent == X:Y	 - parent is a node in hierarchy.
896 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
897 
898 	   handle == 0:0	 - generate handle from kernel pool.
899 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
900 	   handle == X:Y	 - handle is fully specified.
901 	   handle == X:0	 - root class.
902 	 */
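
	/*
	 * For example, "tc class add dev eth0 parent 1: classid 1:10 ..."
	 * arrives here with tcm_parent == 0x00010000 and
	 * tcm_handle == 0x00010010 (tc parses the numbers as hex).
	 */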
903 
904 	/* Step 1. Determine qdisc handle X:0 */
905 
906 	if (pid != TC_H_ROOT) {
907 		u32 qid1 = TC_H_MAJ(pid);
908 
909 		if (qid && qid1) {
910 			/* If both majors are known, they must be identical. */
911 			if (qid != qid1)
912 				return -EINVAL;
913 		} else if (qid1) {
914 			qid = qid1;
915 		} else if (qid == 0)
916 			qid = dev->qdisc_sleeping->handle;
917 
918 		/* Now qid is a genuine qdisc handle consistent
919 		   with both parent and child.
920 
921 		   TC_H_MAJ(pid) may still be unspecified; complete it now.
922 		 */
923 		if (pid)
924 			pid = TC_H_MAKE(qid, pid);
925 	} else {
926 		if (qid == 0)
927 			qid = dev->qdisc_sleeping->handle;
928 	}
929 
930 	/* OK. Locate qdisc */
931 	if ((q = qdisc_lookup(dev, qid)) == NULL)
932 		return -ENOENT;
933 
934 	/* And check that it supports classes */
935 	cops = q->ops->cl_ops;
936 	if (cops == NULL)
937 		return -EINVAL;
938 
939 	/* Now try to get class */
940 	if (clid == 0) {
941 		if (pid == TC_H_ROOT)
942 			clid = qid;
943 	} else
944 		clid = TC_H_MAKE(qid, clid);
945 
946 	if (clid)
947 		cl = cops->get(q, clid);
948 
949 	if (cl == 0) {
950 		err = -ENOENT;
951 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
952 			goto out;
953 	} else {
954 		switch (n->nlmsg_type) {
955 		case RTM_NEWTCLASS:
956 			err = -EEXIST;
957 			if (n->nlmsg_flags&NLM_F_EXCL)
958 				goto out;
959 			break;
960 		case RTM_DELTCLASS:
961 			err = cops->delete(q, cl);
962 			if (err == 0)
963 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
964 			goto out;
965 		case RTM_GETTCLASS:
966 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
967 			goto out;
968 		default:
969 			err = -EINVAL;
970 			goto out;
971 		}
972 	}
973 
974 	new_cl = cl;
975 	err = cops->change(q, clid, pid, tca, &new_cl);
976 	if (err == 0)
977 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
978 
979 out:
980 	if (cl)
981 		cops->put(q, cl);
982 
983 	return err;
984 }
985 
986 
987 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
988 			  unsigned long cl,
989 			  u32 pid, u32 seq, u16 flags, int event)
990 {
991 	struct tcmsg *tcm;
992 	struct nlmsghdr  *nlh;
993 	unsigned char	 *b = skb->tail;
994 	struct gnet_dump d;
995 	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
996 
997 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
998 	tcm = NLMSG_DATA(nlh);
999 	tcm->tcm_family = AF_UNSPEC;
1000 	tcm->tcm_ifindex = q->dev->ifindex;
1001 	tcm->tcm_parent = q->handle;
1002 	tcm->tcm_handle = q->handle;
1003 	tcm->tcm_info = 0;
1004 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1005 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1006 		goto rtattr_failure;
1007 
1008 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1009 			TCA_XSTATS, q->stats_lock, &d) < 0)
1010 		goto rtattr_failure;
1011 
1012 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1013 		goto rtattr_failure;
1014 
1015 	if (gnet_stats_finish_copy(&d) < 0)
1016 		goto rtattr_failure;
1017 
1018 	nlh->nlmsg_len = skb->tail - b;
1019 	return skb->len;
1020 
1021 nlmsg_failure:
1022 rtattr_failure:
1023 	skb_trim(skb, b - skb->data);
1024 	return -1;
1025 }
1026 
1027 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1028 			  struct Qdisc *q, unsigned long cl, int event)
1029 {
1030 	struct sk_buff *skb;
1031 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1032 
1033 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1034 	if (!skb)
1035 		return -ENOBUFS;
1036 
1037 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1038 		kfree_skb(skb);
1039 		return -EINVAL;
1040 	}
1041 
1042 	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1043 }
1044 
1045 struct qdisc_dump_args
1046 {
1047 	struct qdisc_walker w;
1048 	struct sk_buff *skb;
1049 	struct netlink_callback *cb;
1050 };
1051 
1052 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1053 {
1054 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1055 
1056 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1057 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1058 }
1059 
1060 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1061 {
1062 	int t;
1063 	int s_t;
1064 	struct net_device *dev;
1065 	struct Qdisc *q;
1066 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1067 	struct qdisc_dump_args arg;
1068 
1069 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1070 		return 0;
1071 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1072 		return 0;
1073 
1074 	s_t = cb->args[0];
1075 	t = 0;
1076 
1077 	read_lock_bh(&qdisc_tree_lock);
1078 	list_for_each_entry(q, &dev->qdisc_list, list) {
1079 		if (t < s_t || !q->ops->cl_ops ||
1080 		    (tcm->tcm_parent &&
1081 		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1082 			t++;
1083 			continue;
1084 		}
1085 		if (t > s_t)
1086 			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1087 		arg.w.fn = qdisc_class_dump;
1088 		arg.skb = skb;
1089 		arg.cb = cb;
1090 		arg.w.stop  = 0;
1091 		arg.w.skip = cb->args[1];
1092 		arg.w.count = 0;
1093 		q->ops->cl_ops->walk(q, &arg.w);
1094 		cb->args[1] = arg.w.count;
1095 		if (arg.w.stop)
1096 			break;
1097 		t++;
1098 	}
1099 	read_unlock_bh(&qdisc_tree_lock);
1100 
1101 	cb->args[0] = t;
1102 
1103 	dev_put(dev);
1104 	return skb->len;
1105 }
1106 
1107 /* Main classifier routine: scans the classifier chain attached
1108    to this qdisc, (optionally) tests for the protocol and asks the
1109    specific classifiers.
1110  */
1111 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1112 	struct tcf_result *res)
1113 {
1114 	int err = 0;
1115 	u32 protocol = skb->protocol;
1116 #ifdef CONFIG_NET_CLS_ACT
1117 	struct tcf_proto *otp = tp;
1118 reclassify:
1119 #endif
1120 	protocol = skb->protocol;
1121 
1122 	for ( ; tp; tp = tp->next) {
1123 		if ((tp->protocol == protocol ||
1124 			tp->protocol == __constant_htons(ETH_P_ALL)) &&
1125 			(err = tp->classify(skb, tp, res)) >= 0) {
1126 #ifdef CONFIG_NET_CLS_ACT
1127 			if ( TC_ACT_RECLASSIFY == err) {
1128 				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1129 				tp = otp;
1130 
1131 				if (MAX_REC_LOOP < verd++) {
1132 				printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
1133 						tp->prio&0xffff, ntohs(tp->protocol));
1134 					return TC_ACT_SHOT;
1135 				}
1136 				skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1137 				goto reclassify;
1138 			} else {
1139 				if (skb->tc_verd)
1140 					skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1141 				return err;
1142 			}
1143 #else
1144 
1145 			return err;
1146 #endif
1147 		}
1148 
1149 	}
1150 	return -1;
1151 }
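
/*
 * A classful qdisc's enqueue path typically uses tc_classify() to pick
 * the target class (sketch; "filter_list" and the fallback classid are
 * the qdisc's own business and hypothetical here):
 *
 *	struct tcf_result res;
 *
 *	if (tc_classify(skb, q->filter_list, &res) >= 0)
 *		cl = res.classid;		// a filter matched
 *	else
 *		cl = q->default_classid;	// hypothetical fallback
 */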
1152 
1153 static int psched_us_per_tick = 1;
1154 static int psched_tick_per_us = 1;
1155 
1156 #ifdef CONFIG_PROC_FS
1157 static int psched_show(struct seq_file *seq, void *v)
1158 {
1159 	seq_printf(seq, "%08x %08x %08x %08x\n",
1160 		      psched_tick_per_us, psched_us_per_tick,
1161 		      1000000, HZ);
1162 
1163 	return 0;
1164 }
1165 
1166 static int psched_open(struct inode *inode, struct file *file)
1167 {
1168 	return single_open(file, psched_show, PDE(inode)->data);
1169 }
1170 
1171 static struct file_operations psched_fops = {
1172 	.owner = THIS_MODULE,
1173 	.open = psched_open,
1174 	.read  = seq_read,
1175 	.llseek = seq_lseek,
1176 	.release = single_release,
1177 };
1178 #endif
1179 
1180 #ifdef CONFIG_NET_SCH_CLK_CPU
1181 psched_tdiff_t psched_clock_per_hz;
1182 int psched_clock_scale;
1183 EXPORT_SYMBOL(psched_clock_per_hz);
1184 EXPORT_SYMBOL(psched_clock_scale);
1185 
1186 psched_time_t psched_time_base;
1187 cycles_t psched_time_mark;
1188 EXPORT_SYMBOL(psched_time_mark);
1189 EXPORT_SYMBOL(psched_time_base);
1190 
1191 /*
1192  * Periodically adjust psched_time_base to avoid overflow
193  * with 32-bit get_cycles(). Safe up to a 4 GHz CPU.
1194  */
1195 static void psched_tick(unsigned long);
1196 static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
1197 
1198 static void psched_tick(unsigned long dummy)
1199 {
1200 	if (sizeof(cycles_t) == sizeof(u32)) {
1201 		psched_time_t dummy_stamp;
1202 		PSCHED_GET_TIME(dummy_stamp);
1203 		psched_timer.expires = jiffies + 1*HZ;
1204 		add_timer(&psched_timer);
1205 	}
1206 }
1207 
1208 int __init psched_calibrate_clock(void)
1209 {
1210 	psched_time_t stamp, stamp1;
1211 	struct timeval tv, tv1;
1212 	psched_tdiff_t delay;
1213 	long rdelay;
1214 	unsigned long stop;
1215 
1216 	psched_tick(0);
1217 	stop = jiffies + HZ/10;
1218 	PSCHED_GET_TIME(stamp);
1219 	do_gettimeofday(&tv);
1220 	while (time_before(jiffies, stop)) {
1221 		barrier();
1222 		cpu_relax();
1223 	}
1224 	PSCHED_GET_TIME(stamp1);
1225 	do_gettimeofday(&tv1);
1226 
1227 	delay = PSCHED_TDIFF(stamp1, stamp);
1228 	rdelay = tv1.tv_usec - tv.tv_usec;
1229 	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1230 	if (rdelay > delay)
1231 		return -1;
1232 	delay /= rdelay;
1233 	psched_tick_per_us = delay;
1234 	while ((delay>>=1) != 0)
1235 		psched_clock_scale++;
1236 	psched_us_per_tick = 1<<psched_clock_scale;
1237 	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1238 	return 0;
1239 }
1240 #endif
1241 
1242 static int __init pktsched_init(void)
1243 {
1244 	struct rtnetlink_link *link_p;
1245 
1246 #ifdef CONFIG_NET_SCH_CLK_CPU
1247 	if (psched_calibrate_clock() < 0)
1248 		return -1;
1249 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1250 	psched_tick_per_us = HZ<<PSCHED_JSCALE;
1251 	psched_us_per_tick = 1000000;
1252 #endif
1253 
1254 	link_p = rtnetlink_links[PF_UNSPEC];
1255 
1256 	/* Set up the rtnetlink links. This is done here to avoid
1257 	   exporting a large number of public symbols.
1258 	 */
1259 
1260 	if (link_p) {
1261 		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1262 		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1263 		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1264 		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1265 		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1266 		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1267 		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1268 		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1269 	}
1270 
1271 	register_qdisc(&pfifo_qdisc_ops);
1272 	register_qdisc(&bfifo_qdisc_ops);
1273 	proc_net_fops_create("psched", 0, &psched_fops);
1274 
1275 	return 0;
1276 }
1277 
1278 subsys_initcall(pktsched_init);
1279 
1280 EXPORT_SYMBOL(qdisc_lookup);
1281 EXPORT_SYMBOL(qdisc_get_rtab);
1282 EXPORT_SYMBOL(qdisc_put_rtab);
1283 EXPORT_SYMBOL(register_qdisc);
1284 EXPORT_SYMBOL(unregister_qdisc);
1285 EXPORT_SYMBOL(tc_classify);
1286