xref: /linux/net/sched/sch_api.c (revision 60b2737de1b1ddfdb90f3ba622634eb49d6f3603)
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in an order and at times determined by the
   algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to make some sanity checks
   and do the part of the work that is common to all qdiscs, and to
   provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return
   NULL, but that does not mean the queue is empty; it just means that
   the discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns a
   non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that has already been dequeued once. It is used
   for non-standard or just buggy devices, which can defer output even
   when dev->tbusy == 0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears
   all timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of
   the qdisc.

   ---change

   changes qdisc parameters.
 */
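
/*
 * Example: a minimal FIFO-style enqueue/dequeue pair illustrating the
 * contract above (an editor's illustrative sketch, not code from this
 * file; a real discipline also fills in .id, .priv_size, .owner and
 * the remaining ops):
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) >= sch->dev->tx_queue_len) {
 *			kfree_skb(skb);
 *			sch->qstats.drops++;
 *			return NET_XMIT_DROP;	-- this packet was dropped
 *		}
 *		__skb_queue_tail(&sch->q, skb);
 *		return 0;			-- enqueued successfully
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		-- NULL means only "nothing to send right now";
 *		-- emptiness is judged by q->q.qlen == 0.
 *		return __skb_dequeue(&sch->q);
 *	}
 */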

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
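
/* Typical usage from a scheduler module (an illustrative sketch; the
 * "example" ops and functions are hypothetical, not part of this file):
 *
 *	static struct Qdisc_ops example_qdisc_ops = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 * register_qdisc() returns -EEXIST if an ops with the same id is
 * already registered; unregister_qdisc() returns -ENOENT if the ops
 * was never registered.
 */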

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle) {
			read_unlock_bh(&qdisc_tree_lock);
			return q;
		}
	}
	read_unlock_bh(&qdisc_tree_lock);
	return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
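
/* Usage sketch: a shaper obtains a rate table once at init time and
 * indexes it by packet length to get the transmission time in
 * scheduler ticks (an editor's illustration of the pattern used by
 * disciplines such as TBF; the attribute name is hypothetical):
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB-1]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	ticks = rtab->data[pkt_len >> rtab->rate.cell_log];
 *	...
 *	qdisc_put_rtab(rtab);	-- drop the reference on destroy
 *
 * The table holds 256 u32 entries (hence the RTA_PAYLOAD(tab) != 1024
 * check above), one per 2^cell_log-byte size cell.
 */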


/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
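
/* Worked example of the handle arithmetic (assuming the usual TC_H_*
 * macros from pkt_sched.h): a handle packs a 16-bit major and a 16-bit
 * minor number, so the first automatically allocated handle is
 *
 *	0x80000000 + TC_H_MAKE(0x10000U, 0) == 0x80010000, i.e. "8001:0",
 *
 * with TC_H_MAJ() == 0x80010000 and TC_H_MIN() == 0. The loop gives up
 * and returns 0 after finding 0x10000 occupied majors.
 */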

/* Attach a toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;

	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	void *p = NULL;
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	int size;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replayed request will look up the
				 * ops again via qdisc_lookup_ops, so
				 * don't keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
	size += ops->priv_size + QDISC_ALIGN_CONST;

	p = kmalloc(size, GFP_KERNEL);
	err = -ENOBUFS;
	if (!p)
		goto err_out2;
	memset(p, 0, size);
	sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
	                       & ~QDISC_ALIGN_CONST);
	sch->padded = (char *)sch - (char *)p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	if (handle == TC_H_INGRESS)
		sch->flags |= TCQ_F_INGRESS;

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);
	sch->stats_lock = &dev->queue_lock;
	if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	if (handle == TC_H_INGRESS)
		sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
	else
		sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			gen_new_estimator(&sch->bstats, &sch->rate_est,
				sch->stats_lock, tca[TCA_RATE-1]);
#endif
		return sch;
	}
err_out3:
	dev_put(dev);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	if (p)
		kfree(p);
	return NULL;
}
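
/* Worked example of the alignment arithmetic in qdisc_create() above,
 * assuming QDISC_ALIGN_CONST == 31 (32-byte alignment) and purely
 * illustrative sizes sizeof(*sch) == 140, priv_size == 40:
 *
 *	size  = (140 + 31) & ~31;	-- 160, struct rounded up
 *	size += 40 + 31;		-- 231, room to realign priv data
 *
 * After kmalloc(), the Qdisc pointer is rounded up to the next 32-byte
 * boundary inside the allocation, and sch->padded records the offset
 * so the original pointer can be recovered for kfree().
 */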

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
			sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
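
/* Example of what the loop check guards against: with qdisc 1: at the
 * root and a class 1:1 under it, a request to graft 1: beneath its own
 * class 1:1 would turn the tree into a cycle. check_loop(q, p, 0)
 * walks every class of q, recursing into each leaf qdisc, and fails
 * with -ELOOP as soon as it meets p again or nests deeper than 7
 * levels.
 */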

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags
				 *   are set.
				 *
				 *   2. If EXCL is set, the requestor wanted
				 *   to say that the qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are
				 *   set. Alas, it is a sort of hole in the
				 *   API; we cannot decide what to do
				 *   unambiguously. For now we select
				 *   create/graft if the user gave a KIND
				 *   which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
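
/* What arrives here: e.g. "tc qdisc replace dev eth0 root handle 1: pfifo"
 * becomes an RTM_NEWQDISC message with NLM_F_CREATE|NLM_F_REPLACE set,
 * tcm_parent == TC_H_ROOT and tcm_handle == 0x00010000. A minimal
 * userspace sketch of such a request (an editor's illustration; error
 * handling omitted, addattr_l() is the iproute2-style helper):
 *
 *	struct {
 *		struct nlmsghdr	n;
 *		struct tcmsg	t;
 *		char		buf[256];
 *	} req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
 *	req.n.nlmsg_type = RTM_NEWQDISC;
 *	req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE;
 *	req.t.tcm_family = AF_UNSPEC;
 *	req.t.tcm_ifindex = if_nametoindex("eth0");
 *	req.t.tcm_parent = TC_H_ROOT;
 *	req.t.tcm_handle = 0x00010000;	-- "1:"
 *	addattr_l(&req.n, sizeof(req), TCA_KIND, "pfifo", 6);
 *	send(rtnl_fd, &req, req.n.nlmsg_len, 0);
 */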

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock_bh(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock_bh(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock_bh(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}


/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/


static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - the class is root, which has no parent.
	   parent == X:0	 - the parent is the root class.
	   parent == X:Y	 - the parent is a node in the hierarchy.
	   parent == 0:Y	 - the parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - the class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - the root class.
	 */
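
	/* Worked example: "tc class add dev eth0 parent 1:1 classid 1:10"
	 * arrives with tcm_parent == 0x00010001 and tcm_handle ==
	 * 0x0001000a, so pid = 1:1, clid = 1:10 and qid = TC_H_MAJ(clid)
	 * = 1:0, the handle of the qdisc that owns both classes.
	 */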

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is the genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock_bh(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	u32 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
			tp->protocol == __constant_htons(ETH_P_ALL)) &&
			(err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err == TC_ACT_RECLASSIFY) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x: reclassify loop, packet dropped\n",
						tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else
			return err;
#endif
		}
	}
	return -1;
}
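
/* Usage sketch: a classful qdisc's enqueue path typically runs its
 * filter chain like this (an editor's illustration modelled on the
 * in-tree classful schedulers; "filter_list" is the discipline's own
 * struct tcf_proto chain and example_find() is hypothetical):
 *
 *	struct tcf_result res;
 *	struct example_class *cl = NULL;
 *
 *	if (tc_classify(skb, q->filter_list, &res) >= 0) {
 *		cl = (struct example_class *)res.class;
 *		if (cl == NULL)
 *			cl = example_find(sch, res.classid);
 *	}
 *	-- a negative return means no filter matched: fall back
 *	-- to a default class or drop the packet.
 */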

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
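
/* Worked example of the calibration arithmetic (illustrative numbers):
 * on a 1 GHz CPU the ~100 ms busy-wait sees delay ~= 100,000,000
 * cycles and rdelay ~= 100,000 us, so psched_tick_per_us = 1000.
 * Halving 1000 until it reaches 0 takes 9 steps, so
 * psched_clock_scale = 9, psched_us_per_tick = 1 << 9 = 512, and with
 * HZ == 1000, psched_clock_per_hz = (1000 * 1000) >> 9 ~= 1953 scaled
 * cycles per jiffy.
 */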
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);