/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing disciplines manager frontend.
   2. The traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into
   a form more intelligible to the kernel, to perform the sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the really intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue
   (a minimal sketch of both follows this comment).

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty: it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was dequeued once already. It is used by
   non-standard or just buggy devices, which can defer output even
   if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears
   all timers and counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes the qdisc parameters.
 */
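
/* A minimal illustrative sketch (not part of this file) of the
 * enqueue/dequeue contract described above, in the style of a
 * trivial drop-tail queue. The example_fifo_* names are hypothetical
 * and the block is compiled out so it cannot affect the build.
 */
#if 0
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (sch->q.qlen < sch->dev->tx_queue_len) {
		__skb_queue_tail(&sch->q, skb);
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}
	/* This very packet is dropped: report NET_XMIT_DROP, so the
	 * caller does not back off but waits until the queue clears. */
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	/* A plain FIFO never throttles, so NULL really means "empty"
	 * here; a shaping discipline may return NULL while q.qlen > 0. */
	return __skb_dequeue(&sch->q);
}
#endif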

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
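
/* Illustrative sketch: how a scheduler module would typically use
 * register_qdisc()/unregister_qdisc() from its init/exit hooks.
 * example_qdisc_ops is a hypothetical Qdisc_ops instance; the block
 * is compiled out.
 */
#if 0
static struct Qdisc_ops example_qdisc_ops;	/* filled in elsewhere */

static int __init example_module_init(void)
{
	/* Fails with -EEXIST if an ops with the same id is registered. */
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
#endif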

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle) {
			read_unlock_bh(&qdisc_tree_lock);
			return q;
		}
	}
	read_unlock_bh(&qdisc_tree_lock);
	return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

/* Rate tables are 1024-byte (256 x u32) transmission time tables,
   shared and reference counted between qdiscs with an identical
   tc_ratespec. */

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
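
/* Illustrative sketch: a shaping qdisc would typically look up its
 * rate table while parsing netlink options and release it again on
 * destroy/change. TCA_EXAMPLE_RTAB and example_rtab are hypothetical;
 * the block is compiled out.
 */
#if 0
static struct qdisc_rate_table *example_rtab;

static int example_set_rate(struct tc_ratespec *r, struct rtattr **tb)
{
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(r, tb[TCA_EXAMPLE_RTAB-1]);
	if (rtab == NULL)
		return -EINVAL;
	qdisc_put_rtab(example_rtab);	/* safe on NULL: drops the old table */
	example_rtab = rtab;
	return 0;
}
#endif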


/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
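
/* Illustrative arithmetic: autohandle starts at 0x80000000 and is
 * advanced by 0x10000 before the first lookup, so the first handle
 * handed out is TC_H_MAKE(0x80010000, 0), i.e. 8001:0 in tc's hex
 * notation; each further call bumps the major number until a free
 * handle is found or the 0x10000 attempts are exhausted.
 */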

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags & TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags & TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via the tca attribute array.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	void *p = NULL;
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	int size;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replayed request will call
				 * qdisc_lookup_ops again, so don't keep
				 * a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
	size += ops->priv_size + QDISC_ALIGN_CONST;

	p = kmalloc(size, GFP_KERNEL);
	err = -ENOBUFS;
	if (!p)
		goto err_out2;
	memset(p, 0, size);
	sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
	                       & ~QDISC_ALIGN_CONST);
	sch->padded = (char *)sch - (char *)p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	if (handle == TC_H_INGRESS)
		sch->flags |= TCQ_F_INGRESS;

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);
	sch->stats_lock = &dev->queue_lock;
	if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	if (handle == TC_H_INGRESS)
		sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
	else
		sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			gen_new_estimator(&sch->bstats, &sch->rate_est,
				sch->stats_lock, tca[TCA_RATE-1]);
#endif
		return sch;
	}
err_out3:
	dev_put(dev);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	if (p)
		kfree(p);
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
			sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

/* Walk the class hierarchy of q and fail with -ELOOP if qdisc p is
   reachable beneath it (or the tree is deeper than 7 levels), i.e. if
   grafting q under p would create a cycle. */
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API: we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock_bh(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock_bh(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock_bh(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */
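	/* A worked example with illustrative values: with qdisc 1:0 on
	 * the device, parent == 1:0 and handle == 1:10 name the class
	 * TC_H_MAKE(0x00010000, 0x10) == 0x00010010; a request with
	 * handle == 0:10 is completed below to 1:10 using the parent's
	 * major number.
	 */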

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent
		   with both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock_bh(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol, and asks
   the specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	u32 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
			tp->protocol == __constant_htons(ETH_P_ALL)) &&
			(err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x: reclassify is buggy, packet dropped\n",
						tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else
			return err;
#endif
		}
	}
	return -1;
}
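
/* Illustrative sketch: how a classful qdisc's enqueue path might map
 * the result of tc_classify() onto one of its bands. The example_*
 * name and the band convention are hypothetical (loosely modelled on
 * a prio-style discipline); the block is compiled out.
 */
#if 0
static int example_classify_band(struct sk_buff *skb, struct Qdisc *sch,
				 struct tcf_proto *filter_list)
{
	struct tcf_result res;

	switch (tc_classify(skb, filter_list, &res)) {
	case -1:
		return 0;		/* no filter matched: default band */
#ifdef CONFIG_NET_CLS_ACT
	case TC_ACT_SHOT:
		return -1;		/* an action asked us to drop it */
#endif
	default:
		/* the minor number of res.classid selects the band */
		return TC_H_MIN(res.classid) - 1;
	}
}
#endif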

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

/* Busy-wait for roughly a tenth of a second and compare the CPU cycle
 * counter against gettimeofday to derive ticks-per-microsecond and the
 * power-of-two scale used by the CPU clock source. */
int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. This is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);