xref: /linux/net/sched/sch_api.c (revision a0b54e256d513ed99e456bea6e4e188ff92e7c46)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 
32 #include <net/net_namespace.h>
33 #include <net/sock.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
36 
/*
 * Forward declarations of the netlink notification helpers, so that they
 * can be called by code placed above their definitions.
 */
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
41 
42 /*
43 
44    Short review.
45    -------------
46 
47    This file consists of two interrelated parts:
48 
49    1. queueing disciplines manager frontend.
50    2. traffic classes manager frontend.
51 
52    Generally, queueing discipline ("qdisc") is a black box,
53    which is able to enqueue packets and to dequeue them (when
54    device is ready to send something) in order and at times
55    determined by algorithm hidden in it.
56 
   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)
61 
62    In turn, classes may have child qdiscs (as rule, queues)
63    attached to them etc. etc. etc.
64 
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.
70 
71    All real intelligent work is done inside qdisc modules.
72 
73 
74 
75    Every discipline has two major routines: enqueue and dequeue.
76 
77    ---dequeue
78 
79    dequeue usually returns a skb to send. It is allowed to return NULL,
80    but it does not mean that queue is empty, it just means that
81    discipline does not want to send anything this time.
82    Queue is really empty if q->q.qlen == 0.
83    For complicated disciplines with multiple queues q->q is not
84    real packet queue, but however q->q.qlen must be valid.
85 
86    ---enqueue
87 
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
91    NET_XMIT_DROP 	- this packet dropped
92      Expected action: do not backoff, but wait until queue will clear.
93    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
94      Expected action: backoff or ignore
95    NET_XMIT_POLICED	- dropped by police.
96      Expected action: backoff or error to real-time apps.
97 
98    Auxiliary routines:
99 
100    ---peek
101 
102    like dequeue but without removing a packet from the queue
103 
104    ---reset
105 
106    returns qdisc to initial state: purge all buffers, clear all
107    timers, counters (except for statistics) etc.
108 
109    ---init
110 
111    initializes newly created qdisc.
112 
113    ---destroy
114 
115    destroys resources allocated by init and during lifetime of qdisc.
116 
117    ---change
118 
119    changes qdisc parameters.
120  */
121 
/*
 * Protects the list of registered TC modules (the qdisc_base chain).
 * It is a pure SMP lock: writers are register_qdisc()/unregister_qdisc(),
 * the reader is qdisc_lookup_ops().
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
124 
125 
126 /************************************************
127  *	Queueing disciplines manipulation.	*
128  ************************************************/
129 
130 
/*
 * Head of the singly linked list (chained through Qdisc_ops::next) of all
 * registered queueing disciplines. Accessed under qdisc_mod_lock.
 */

static struct Qdisc_ops *qdisc_base;
134 
/* Register/unregister queueing discipline */
136 
137 int register_qdisc(struct Qdisc_ops *qops)
138 {
139 	struct Qdisc_ops *q, **qp;
140 	int rc = -EEXIST;
141 
142 	write_lock(&qdisc_mod_lock);
143 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 		if (!strcmp(qops->id, q->id))
145 			goto out;
146 
147 	if (qops->enqueue == NULL)
148 		qops->enqueue = noop_qdisc_ops.enqueue;
149 	if (qops->peek == NULL) {
150 		if (qops->dequeue == NULL) {
151 			qops->peek = noop_qdisc_ops.peek;
152 		} else {
153 			rc = -EINVAL;
154 			goto out;
155 		}
156 	}
157 	if (qops->dequeue == NULL)
158 		qops->dequeue = noop_qdisc_ops.dequeue;
159 
160 	qops->next = NULL;
161 	*qp = qops;
162 	rc = 0;
163 out:
164 	write_unlock(&qdisc_mod_lock);
165 	return rc;
166 }
167 EXPORT_SYMBOL(register_qdisc);
168 
169 int unregister_qdisc(struct Qdisc_ops *qops)
170 {
171 	struct Qdisc_ops *q, **qp;
172 	int err = -ENOENT;
173 
174 	write_lock(&qdisc_mod_lock);
175 	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
176 		if (q == qops)
177 			break;
178 	if (q) {
179 		*qp = q->next;
180 		q->next = NULL;
181 		err = 0;
182 	}
183 	write_unlock(&qdisc_mod_lock);
184 	return err;
185 }
186 EXPORT_SYMBOL(unregister_qdisc);
187 
188 /* We know handle. Find qdisc among all qdisc's attached to device
189    (root qdisc, all its children, children of children etc.)
190  */
191 
192 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193 {
194 	struct Qdisc *q;
195 
196 	if (!(root->flags & TCQ_F_BUILTIN) &&
197 	    root->handle == handle)
198 		return root;
199 
200 	list_for_each_entry(q, &root->list, list) {
201 		if (q->handle == handle)
202 			return q;
203 	}
204 	return NULL;
205 }
206 
207 static void qdisc_list_add(struct Qdisc *q)
208 {
209 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
210 		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
211 }
212 
213 void qdisc_list_del(struct Qdisc *q)
214 {
215 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
216 		list_del(&q->list);
217 }
218 EXPORT_SYMBOL(qdisc_list_del);
219 
220 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
221 {
222 	struct Qdisc *q;
223 
224 	q = qdisc_match_from_root(dev->qdisc, handle);
225 	if (q)
226 		goto out;
227 
228 	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
229 out:
230 	return q;
231 }
232 
233 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
234 {
235 	unsigned long cl;
236 	struct Qdisc *leaf;
237 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
238 
239 	if (cops == NULL)
240 		return NULL;
241 	cl = cops->get(p, classid);
242 
243 	if (cl == 0)
244 		return NULL;
245 	leaf = cops->leaf(p, cl);
246 	cops->put(p, cl);
247 	return leaf;
248 }
249 
250 /* Find queueing discipline by name */
251 
252 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
253 {
254 	struct Qdisc_ops *q = NULL;
255 
256 	if (kind) {
257 		read_lock(&qdisc_mod_lock);
258 		for (q = qdisc_base; q; q = q->next) {
259 			if (nla_strcmp(kind, q->id) == 0) {
260 				if (!try_module_get(q->owner))
261 					q = NULL;
262 				break;
263 			}
264 		}
265 		read_unlock(&qdisc_mod_lock);
266 	}
267 	return q;
268 }
269 
/*
 * Singly linked list of shared, reference-counted rate tables, managed by
 * qdisc_get_rtab() and qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
271 
272 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
273 {
274 	struct qdisc_rate_table *rtab;
275 
276 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
277 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
278 			rtab->refcnt++;
279 			return rtab;
280 		}
281 	}
282 
283 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
284 	    nla_len(tab) != TC_RTAB_SIZE)
285 		return NULL;
286 
287 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
288 	if (rtab) {
289 		rtab->rate = *r;
290 		rtab->refcnt = 1;
291 		memcpy(rtab->data, nla_data(tab), 1024);
292 		rtab->next = qdisc_rtab_list;
293 		qdisc_rtab_list = rtab;
294 	}
295 	return rtab;
296 }
297 EXPORT_SYMBOL(qdisc_get_rtab);
298 
299 void qdisc_put_rtab(struct qdisc_rate_table *tab)
300 {
301 	struct qdisc_rate_table *rtab, **rtabp;
302 
303 	if (!tab || --tab->refcnt)
304 		return;
305 
306 	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
307 		if (rtab == tab) {
308 			*rtabp = rtab->next;
309 			kfree(rtab);
310 			return;
311 		}
312 	}
313 }
314 EXPORT_SYMBOL(qdisc_put_rtab);
315 
/* All size tables currently in use; entries are shared and refcounted. */
static LIST_HEAD(qdisc_stab_list);
/* Serializes access to qdisc_stab_list and to the tables' refcounts. */
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy used by qdisc_get_stab() to parse the nested TCA_STAB. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
323 
324 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
325 {
326 	struct nlattr *tb[TCA_STAB_MAX + 1];
327 	struct qdisc_size_table *stab;
328 	struct tc_sizespec *s;
329 	unsigned int tsize = 0;
330 	u16 *tab = NULL;
331 	int err;
332 
333 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
334 	if (err < 0)
335 		return ERR_PTR(err);
336 	if (!tb[TCA_STAB_BASE])
337 		return ERR_PTR(-EINVAL);
338 
339 	s = nla_data(tb[TCA_STAB_BASE]);
340 
341 	if (s->tsize > 0) {
342 		if (!tb[TCA_STAB_DATA])
343 			return ERR_PTR(-EINVAL);
344 		tab = nla_data(tb[TCA_STAB_DATA]);
345 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
346 	}
347 
348 	if (!s || tsize != s->tsize || (!tab && tsize > 0))
349 		return ERR_PTR(-EINVAL);
350 
351 	spin_lock(&qdisc_stab_lock);
352 
353 	list_for_each_entry(stab, &qdisc_stab_list, list) {
354 		if (memcmp(&stab->szopts, s, sizeof(*s)))
355 			continue;
356 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
357 			continue;
358 		stab->refcnt++;
359 		spin_unlock(&qdisc_stab_lock);
360 		return stab;
361 	}
362 
363 	spin_unlock(&qdisc_stab_lock);
364 
365 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
366 	if (!stab)
367 		return ERR_PTR(-ENOMEM);
368 
369 	stab->refcnt = 1;
370 	stab->szopts = *s;
371 	if (tsize > 0)
372 		memcpy(stab->data, tab, tsize * sizeof(u16));
373 
374 	spin_lock(&qdisc_stab_lock);
375 	list_add_tail(&stab->list, &qdisc_stab_list);
376 	spin_unlock(&qdisc_stab_lock);
377 
378 	return stab;
379 }
380 
381 void qdisc_put_stab(struct qdisc_size_table *tab)
382 {
383 	if (!tab)
384 		return;
385 
386 	spin_lock(&qdisc_stab_lock);
387 
388 	if (--tab->refcnt == 0) {
389 		list_del(&tab->list);
390 		kfree(tab);
391 	}
392 
393 	spin_unlock(&qdisc_stab_lock);
394 }
395 EXPORT_SYMBOL(qdisc_put_stab);
396 
397 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
398 {
399 	struct nlattr *nest;
400 
401 	nest = nla_nest_start(skb, TCA_STAB);
402 	if (nest == NULL)
403 		goto nla_put_failure;
404 	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
405 	nla_nest_end(skb, nest);
406 
407 	return skb->len;
408 
409 nla_put_failure:
410 	return -1;
411 }
412 
413 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
414 {
415 	int pkt_len, slot;
416 
417 	pkt_len = skb->len + stab->szopts.overhead;
418 	if (unlikely(!stab->szopts.tsize))
419 		goto out;
420 
421 	slot = pkt_len + stab->szopts.cell_align;
422 	if (unlikely(slot < 0))
423 		slot = 0;
424 
425 	slot >>= stab->szopts.cell_log;
426 	if (likely(slot < stab->szopts.tsize))
427 		pkt_len = stab->data[slot];
428 	else
429 		pkt_len = stab->data[stab->szopts.tsize - 1] *
430 				(slot / stab->szopts.tsize) +
431 				stab->data[slot % stab->szopts.tsize];
432 
433 	pkt_len <<= stab->szopts.size_log;
434 out:
435 	if (unlikely(pkt_len < 1))
436 		pkt_len = 1;
437 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
438 }
439 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
440 
441 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
442 {
443 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
444 		printk(KERN_WARNING
445 		       "%s: %s qdisc %X: is non-work-conserving?\n",
446 		       txt, qdisc->ops->id, qdisc->handle >> 16);
447 		qdisc->flags |= TCQ_F_WARN_NONWC;
448 	}
449 }
450 EXPORT_SYMBOL(qdisc_warn_nonwc);
451 
452 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
453 {
454 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
455 						 timer);
456 
457 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
458 	__netif_schedule(qdisc_root(wd->qdisc));
459 
460 	return HRTIMER_NORESTART;
461 }
462 
463 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
464 {
465 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
466 	wd->timer.function = qdisc_watchdog;
467 	wd->qdisc = qdisc;
468 }
469 EXPORT_SYMBOL(qdisc_watchdog_init);
470 
471 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
472 {
473 	ktime_t time;
474 
475 	if (test_bit(__QDISC_STATE_DEACTIVATED,
476 		     &qdisc_root_sleeping(wd->qdisc)->state))
477 		return;
478 
479 	wd->qdisc->flags |= TCQ_F_THROTTLED;
480 	time = ktime_set(0, 0);
481 	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
482 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
483 }
484 EXPORT_SYMBOL(qdisc_watchdog_schedule);
485 
486 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
487 {
488 	hrtimer_cancel(&wd->timer);
489 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
490 }
491 EXPORT_SYMBOL(qdisc_watchdog_cancel);
492 
493 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
494 {
495 	unsigned int size = n * sizeof(struct hlist_head), i;
496 	struct hlist_head *h;
497 
498 	if (size <= PAGE_SIZE)
499 		h = kmalloc(size, GFP_KERNEL);
500 	else
501 		h = (struct hlist_head *)
502 			__get_free_pages(GFP_KERNEL, get_order(size));
503 
504 	if (h != NULL) {
505 		for (i = 0; i < n; i++)
506 			INIT_HLIST_HEAD(&h[i]);
507 	}
508 	return h;
509 }
510 
511 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
512 {
513 	unsigned int size = n * sizeof(struct hlist_head);
514 
515 	if (size <= PAGE_SIZE)
516 		kfree(h);
517 	else
518 		free_pages((unsigned long)h, get_order(size));
519 }
520 
521 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
522 {
523 	struct Qdisc_class_common *cl;
524 	struct hlist_node *n, *next;
525 	struct hlist_head *nhash, *ohash;
526 	unsigned int nsize, nmask, osize;
527 	unsigned int i, h;
528 
529 	/* Rehash when load factor exceeds 0.75 */
530 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
531 		return;
532 	nsize = clhash->hashsize * 2;
533 	nmask = nsize - 1;
534 	nhash = qdisc_class_hash_alloc(nsize);
535 	if (nhash == NULL)
536 		return;
537 
538 	ohash = clhash->hash;
539 	osize = clhash->hashsize;
540 
541 	sch_tree_lock(sch);
542 	for (i = 0; i < osize; i++) {
543 		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
544 			h = qdisc_class_hash(cl->classid, nmask);
545 			hlist_add_head(&cl->hnode, &nhash[h]);
546 		}
547 	}
548 	clhash->hash     = nhash;
549 	clhash->hashsize = nsize;
550 	clhash->hashmask = nmask;
551 	sch_tree_unlock(sch);
552 
553 	qdisc_class_hash_free(ohash, osize);
554 }
555 EXPORT_SYMBOL(qdisc_class_hash_grow);
556 
557 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
558 {
559 	unsigned int size = 4;
560 
561 	clhash->hash = qdisc_class_hash_alloc(size);
562 	if (clhash->hash == NULL)
563 		return -ENOMEM;
564 	clhash->hashsize  = size;
565 	clhash->hashmask  = size - 1;
566 	clhash->hashelems = 0;
567 	return 0;
568 }
569 EXPORT_SYMBOL(qdisc_class_hash_init);
570 
571 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
572 {
573 	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
574 }
575 EXPORT_SYMBOL(qdisc_class_hash_destroy);
576 
577 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
578 			     struct Qdisc_class_common *cl)
579 {
580 	unsigned int h;
581 
582 	INIT_HLIST_NODE(&cl->hnode);
583 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
584 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
585 	clhash->hashelems++;
586 }
587 EXPORT_SYMBOL(qdisc_class_hash_insert);
588 
589 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
590 			     struct Qdisc_class_common *cl)
591 {
592 	hlist_del(&cl->hnode);
593 	clhash->hashelems--;
594 }
595 EXPORT_SYMBOL(qdisc_class_hash_remove);
596 
597 /* Allocate an unique handle from space managed by kernel */
598 
599 static u32 qdisc_alloc_handle(struct net_device *dev)
600 {
601 	int i = 0x10000;
602 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
603 
604 	do {
605 		autohandle += TC_H_MAKE(0x10000U, 0);
606 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
607 			autohandle = TC_H_MAKE(0x80000000U, 0);
608 	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
609 
610 	return i>0 ? autohandle : 0;
611 }
612 
613 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
614 {
615 	const struct Qdisc_class_ops *cops;
616 	unsigned long cl;
617 	u32 parentid;
618 
619 	if (n == 0)
620 		return;
621 	while ((parentid = sch->parent)) {
622 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
623 			return;
624 
625 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
626 		if (sch == NULL) {
627 			WARN_ON(parentid != TC_H_ROOT);
628 			return;
629 		}
630 		cops = sch->ops->cl_ops;
631 		if (cops->qlen_notify) {
632 			cl = cops->get(sch, parentid);
633 			cops->qlen_notify(sch, cl);
634 			cops->put(sch, cl);
635 		}
636 		sch->q.qlen -= n;
637 	}
638 }
639 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
640 
641 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
642 			       struct Qdisc *old, struct Qdisc *new)
643 {
644 	if (new || old)
645 		qdisc_notify(skb, n, clid, old, new);
646 
647 	if (old)
648 		qdisc_destroy(old);
649 }
650 
651 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
652  * to device "dev".
653  *
654  * When appropriate send a netlink notification using 'skb'
655  * and "n".
656  *
657  * On success, destroy old qdisc.
658  */
659 
660 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
661 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
662 		       struct Qdisc *new, struct Qdisc *old)
663 {
664 	struct Qdisc *q = old;
665 	int err = 0;
666 
667 	if (parent == NULL) {
668 		unsigned int i, num_q, ingress;
669 
670 		ingress = 0;
671 		num_q = dev->num_tx_queues;
672 		if ((q && q->flags & TCQ_F_INGRESS) ||
673 		    (new && new->flags & TCQ_F_INGRESS)) {
674 			num_q = 1;
675 			ingress = 1;
676 		}
677 
678 		if (dev->flags & IFF_UP)
679 			dev_deactivate(dev);
680 
681 		if (new && new->ops->attach) {
682 			new->ops->attach(new);
683 			num_q = 0;
684 		}
685 
686 		for (i = 0; i < num_q; i++) {
687 			struct netdev_queue *dev_queue = &dev->rx_queue;
688 
689 			if (!ingress)
690 				dev_queue = netdev_get_tx_queue(dev, i);
691 
692 			old = dev_graft_qdisc(dev_queue, new);
693 			if (new && i > 0)
694 				atomic_inc(&new->refcnt);
695 
696 			qdisc_destroy(old);
697 		}
698 
699 		notify_and_destroy(skb, n, classid, dev->qdisc, new);
700 		if (new && !new->ops->attach)
701 			atomic_inc(&new->refcnt);
702 		dev->qdisc = new ? : &noop_qdisc;
703 
704 		if (dev->flags & IFF_UP)
705 			dev_activate(dev);
706 	} else {
707 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
708 
709 		err = -EOPNOTSUPP;
710 		if (cops && cops->graft) {
711 			unsigned long cl = cops->get(parent, classid);
712 			if (cl) {
713 				err = cops->graft(parent, cl, new, &old);
714 				cops->put(parent, cl);
715 			} else
716 				err = -ENOENT;
717 		}
718 		if (!err)
719 			notify_and_destroy(skb, n, classid, old, new);
720 	}
721 	return err;
722 }
723 
/*
 * lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns one of these classes to each new qdisc's lock.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
727 
728 /*
729    Allocate and initialize new qdisc.
730 
731    Parameters are passed via opt.
732  */
733 
734 static struct Qdisc *
735 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
736 	     struct Qdisc *p, u32 parent, u32 handle,
737 	     struct nlattr **tca, int *errp)
738 {
739 	int err;
740 	struct nlattr *kind = tca[TCA_KIND];
741 	struct Qdisc *sch;
742 	struct Qdisc_ops *ops;
743 	struct qdisc_size_table *stab;
744 
745 	ops = qdisc_lookup_ops(kind);
746 #ifdef CONFIG_MODULES
747 	if (ops == NULL && kind != NULL) {
748 		char name[IFNAMSIZ];
749 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
750 			/* We dropped the RTNL semaphore in order to
751 			 * perform the module load.  So, even if we
752 			 * succeeded in loading the module we have to
753 			 * tell the caller to replay the request.  We
754 			 * indicate this using -EAGAIN.
755 			 * We replay the request because the device may
756 			 * go away in the mean time.
757 			 */
758 			rtnl_unlock();
759 			request_module("sch_%s", name);
760 			rtnl_lock();
761 			ops = qdisc_lookup_ops(kind);
762 			if (ops != NULL) {
763 				/* We will try again qdisc_lookup_ops,
764 				 * so don't keep a reference.
765 				 */
766 				module_put(ops->owner);
767 				err = -EAGAIN;
768 				goto err_out;
769 			}
770 		}
771 	}
772 #endif
773 
774 	err = -ENOENT;
775 	if (ops == NULL)
776 		goto err_out;
777 
778 	sch = qdisc_alloc(dev_queue, ops);
779 	if (IS_ERR(sch)) {
780 		err = PTR_ERR(sch);
781 		goto err_out2;
782 	}
783 
784 	sch->parent = parent;
785 
786 	if (handle == TC_H_INGRESS) {
787 		sch->flags |= TCQ_F_INGRESS;
788 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
789 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
790 	} else {
791 		if (handle == 0) {
792 			handle = qdisc_alloc_handle(dev);
793 			err = -ENOMEM;
794 			if (handle == 0)
795 				goto err_out3;
796 		}
797 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
798 	}
799 
800 	sch->handle = handle;
801 
802 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
803 		if (tca[TCA_STAB]) {
804 			stab = qdisc_get_stab(tca[TCA_STAB]);
805 			if (IS_ERR(stab)) {
806 				err = PTR_ERR(stab);
807 				goto err_out3;
808 			}
809 			sch->stab = stab;
810 		}
811 		if (tca[TCA_RATE]) {
812 			spinlock_t *root_lock;
813 
814 			err = -EOPNOTSUPP;
815 			if (sch->flags & TCQ_F_MQROOT)
816 				goto err_out4;
817 
818 			if ((sch->parent != TC_H_ROOT) &&
819 			    !(sch->flags & TCQ_F_INGRESS) &&
820 			    (!p || !(p->flags & TCQ_F_MQROOT)))
821 				root_lock = qdisc_root_sleeping_lock(sch);
822 			else
823 				root_lock = qdisc_lock(sch);
824 
825 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
826 						root_lock, tca[TCA_RATE]);
827 			if (err)
828 				goto err_out4;
829 		}
830 
831 		qdisc_list_add(sch);
832 
833 		return sch;
834 	}
835 err_out3:
836 	qdisc_put_stab(sch->stab);
837 	dev_put(dev);
838 	kfree((char *) sch - sch->padded);
839 err_out2:
840 	module_put(ops->owner);
841 err_out:
842 	*errp = err;
843 	return NULL;
844 
845 err_out4:
846 	/*
847 	 * Any broken qdiscs that would require a ops->reset() here?
848 	 * The qdisc was never in action so it shouldn't be necessary.
849 	 */
850 	if (ops->destroy)
851 		ops->destroy(sch);
852 	goto err_out3;
853 }
854 
855 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
856 {
857 	struct qdisc_size_table *stab = NULL;
858 	int err = 0;
859 
860 	if (tca[TCA_OPTIONS]) {
861 		if (sch->ops->change == NULL)
862 			return -EINVAL;
863 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
864 		if (err)
865 			return err;
866 	}
867 
868 	if (tca[TCA_STAB]) {
869 		stab = qdisc_get_stab(tca[TCA_STAB]);
870 		if (IS_ERR(stab))
871 			return PTR_ERR(stab);
872 	}
873 
874 	qdisc_put_stab(sch->stab);
875 	sch->stab = stab;
876 
877 	if (tca[TCA_RATE]) {
878 		/* NB: ignores errors from replace_estimator
879 		   because change can't be undone. */
880 		if (sch->flags & TCQ_F_MQROOT)
881 			goto out;
882 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
883 					    qdisc_root_sleeping_lock(sch),
884 					    tca[TCA_RATE]);
885 	}
886 out:
887 	return 0;
888 }
889 
/* State carried through the class walk done by check_loop(). */
struct check_loop_arg
{
	struct qdisc_walker 	w;	/* first member: check_loop_fn() casts the walker back to this struct */
	struct Qdisc		*p;	/* qdisc that must not show up as a leaf */
	int			depth;	/* nesting level of the current walk */
};

/* Declared early because check_loop() and check_loop_fn() call each other. */
static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
898 
899 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
900 {
901 	struct check_loop_arg	arg;
902 
903 	if (q->ops->cl_ops == NULL)
904 		return 0;
905 
906 	arg.w.stop = arg.w.skip = arg.w.count = 0;
907 	arg.w.fn = check_loop_fn;
908 	arg.depth = depth;
909 	arg.p = p;
910 	q->ops->cl_ops->walk(q, &arg.w);
911 	return arg.w.stop ? -ELOOP : 0;
912 }
913 
914 static int
915 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
916 {
917 	struct Qdisc *leaf;
918 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
919 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
920 
921 	leaf = cops->leaf(q, cl);
922 	if (leaf) {
923 		if (leaf == arg->p || arg->depth > 7)
924 			return -ELOOP;
925 		return check_loop(leaf, arg->p, arg->depth + 1);
926 	}
927 	return 0;
928 }
929 
930 /*
931  * Delete/get qdisc.
932  */
933 
934 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
935 {
936 	struct net *net = sock_net(skb->sk);
937 	struct tcmsg *tcm = NLMSG_DATA(n);
938 	struct nlattr *tca[TCA_MAX + 1];
939 	struct net_device *dev;
940 	u32 clid = tcm->tcm_parent;
941 	struct Qdisc *q = NULL;
942 	struct Qdisc *p = NULL;
943 	int err;
944 
945 	if (net != &init_net)
946 		return -EINVAL;
947 
948 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
949 		return -ENODEV;
950 
951 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
952 	if (err < 0)
953 		return err;
954 
955 	if (clid) {
956 		if (clid != TC_H_ROOT) {
957 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
958 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
959 					return -ENOENT;
960 				q = qdisc_leaf(p, clid);
961 			} else { /* ingress */
962 				q = dev->rx_queue.qdisc_sleeping;
963 			}
964 		} else {
965 			q = dev->qdisc;
966 		}
967 		if (!q)
968 			return -ENOENT;
969 
970 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
971 			return -EINVAL;
972 	} else {
973 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
974 			return -ENOENT;
975 	}
976 
977 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
978 		return -EINVAL;
979 
980 	if (n->nlmsg_type == RTM_DELQDISC) {
981 		if (!clid)
982 			return -EINVAL;
983 		if (q->handle == 0)
984 			return -ENOENT;
985 		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
986 			return err;
987 	} else {
988 		qdisc_notify(skb, n, clid, NULL, q);
989 	}
990 	return 0;
991 }
992 
993 /*
994    Create/change qdisc.
995  */
996 
997 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
998 {
999 	struct net *net = sock_net(skb->sk);
1000 	struct tcmsg *tcm;
1001 	struct nlattr *tca[TCA_MAX + 1];
1002 	struct net_device *dev;
1003 	u32 clid;
1004 	struct Qdisc *q, *p;
1005 	int err;
1006 
1007 	if (net != &init_net)
1008 		return -EINVAL;
1009 
1010 replay:
1011 	/* Reinit, just in case something touches this. */
1012 	tcm = NLMSG_DATA(n);
1013 	clid = tcm->tcm_parent;
1014 	q = p = NULL;
1015 
1016 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1017 		return -ENODEV;
1018 
1019 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1020 	if (err < 0)
1021 		return err;
1022 
1023 	if (clid) {
1024 		if (clid != TC_H_ROOT) {
1025 			if (clid != TC_H_INGRESS) {
1026 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1027 					return -ENOENT;
1028 				q = qdisc_leaf(p, clid);
1029 			} else { /*ingress */
1030 				q = dev->rx_queue.qdisc_sleeping;
1031 			}
1032 		} else {
1033 			q = dev->qdisc;
1034 		}
1035 
1036 		/* It may be default qdisc, ignore it */
1037 		if (q && q->handle == 0)
1038 			q = NULL;
1039 
1040 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1041 			if (tcm->tcm_handle) {
1042 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1043 					return -EEXIST;
1044 				if (TC_H_MIN(tcm->tcm_handle))
1045 					return -EINVAL;
1046 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1047 					goto create_n_graft;
1048 				if (n->nlmsg_flags&NLM_F_EXCL)
1049 					return -EEXIST;
1050 				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1051 					return -EINVAL;
1052 				if (q == p ||
1053 				    (p && check_loop(q, p, 0)))
1054 					return -ELOOP;
1055 				atomic_inc(&q->refcnt);
1056 				goto graft;
1057 			} else {
1058 				if (q == NULL)
1059 					goto create_n_graft;
1060 
1061 				/* This magic test requires explanation.
1062 				 *
1063 				 *   We know, that some child q is already
1064 				 *   attached to this parent and have choice:
1065 				 *   either to change it or to create/graft new one.
1066 				 *
1067 				 *   1. We are allowed to create/graft only
1068 				 *   if CREATE and REPLACE flags are set.
1069 				 *
1070 				 *   2. If EXCL is set, requestor wanted to say,
1071 				 *   that qdisc tcm_handle is not expected
1072 				 *   to exist, so that we choose create/graft too.
1073 				 *
1074 				 *   3. The last case is when no flags are set.
1075 				 *   Alas, it is sort of hole in API, we
1076 				 *   cannot decide what to do unambiguously.
1077 				 *   For now we select create/graft, if
1078 				 *   user gave KIND, which does not match existing.
1079 				 */
1080 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
1081 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
1082 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
1083 				     (tca[TCA_KIND] &&
1084 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1085 					goto create_n_graft;
1086 			}
1087 		}
1088 	} else {
1089 		if (!tcm->tcm_handle)
1090 			return -EINVAL;
1091 		q = qdisc_lookup(dev, tcm->tcm_handle);
1092 	}
1093 
1094 	/* Change qdisc parameters */
1095 	if (q == NULL)
1096 		return -ENOENT;
1097 	if (n->nlmsg_flags&NLM_F_EXCL)
1098 		return -EEXIST;
1099 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1100 		return -EINVAL;
1101 	err = qdisc_change(q, tca);
1102 	if (err == 0)
1103 		qdisc_notify(skb, n, clid, NULL, q);
1104 	return err;
1105 
1106 create_n_graft:
1107 	if (!(n->nlmsg_flags&NLM_F_CREATE))
1108 		return -ENOENT;
1109 	if (clid == TC_H_INGRESS)
1110 		q = qdisc_create(dev, &dev->rx_queue, p,
1111 				 tcm->tcm_parent, tcm->tcm_parent,
1112 				 tca, &err);
1113 	else {
1114 		unsigned int ntx = 0;
1115 
1116 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1117 			ntx = p->ops->cl_ops->select_queue(p, tcm);
1118 
1119 		q = qdisc_create(dev, netdev_get_tx_queue(dev, ntx), p,
1120 				 tcm->tcm_parent, tcm->tcm_handle,
1121 				 tca, &err);
1122 	}
1123 	if (q == NULL) {
1124 		if (err == -EAGAIN)
1125 			goto replay;
1126 		return err;
1127 	}
1128 
1129 graft:
1130 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1131 	if (err) {
1132 		if (q)
1133 			qdisc_destroy(q);
1134 		return err;
1135 	}
1136 
1137 	return 0;
1138 }
1139 
1140 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1141 			 u32 pid, u32 seq, u16 flags, int event)
1142 {
1143 	struct tcmsg *tcm;
1144 	struct nlmsghdr  *nlh;
1145 	unsigned char *b = skb_tail_pointer(skb);
1146 	struct gnet_dump d;
1147 
1148 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1149 	tcm = NLMSG_DATA(nlh);
1150 	tcm->tcm_family = AF_UNSPEC;
1151 	tcm->tcm__pad1 = 0;
1152 	tcm->tcm__pad2 = 0;
1153 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1154 	tcm->tcm_parent = clid;
1155 	tcm->tcm_handle = q->handle;
1156 	tcm->tcm_info = atomic_read(&q->refcnt);
1157 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1158 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
1159 		goto nla_put_failure;
1160 	q->qstats.qlen = q->q.qlen;
1161 
1162 	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1163 		goto nla_put_failure;
1164 
1165 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1166 					 qdisc_root_sleeping_lock(q), &d) < 0)
1167 		goto nla_put_failure;
1168 
1169 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1170 		goto nla_put_failure;
1171 
1172 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1173 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1174 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
1175 		goto nla_put_failure;
1176 
1177 	if (gnet_stats_finish_copy(&d) < 0)
1178 		goto nla_put_failure;
1179 
1180 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1181 	return skb->len;
1182 
1183 nlmsg_failure:
1184 nla_put_failure:
1185 	nlmsg_trim(skb, b);
1186 	return -1;
1187 }
1188 
1189 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1190 			u32 clid, struct Qdisc *old, struct Qdisc *new)
1191 {
1192 	struct sk_buff *skb;
1193 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1194 
1195 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1196 	if (!skb)
1197 		return -ENOBUFS;
1198 
1199 	if (old && old->handle) {
1200 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1201 			goto err_out;
1202 	}
1203 	if (new) {
1204 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1205 			goto err_out;
1206 	}
1207 
1208 	if (skb->len)
1209 		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1210 
1211 err_out:
1212 	kfree_skb(skb);
1213 	return -EINVAL;
1214 }
1215 
1216 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1217 {
1218 	return (q->flags & TCQ_F_BUILTIN) ? true : false;
1219 }
1220 
1221 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1222 			      struct netlink_callback *cb,
1223 			      int *q_idx_p, int s_q_idx)
1224 {
1225 	int ret = 0, q_idx = *q_idx_p;
1226 	struct Qdisc *q;
1227 
1228 	if (!root)
1229 		return 0;
1230 
1231 	q = root;
1232 	if (q_idx < s_q_idx) {
1233 		q_idx++;
1234 	} else {
1235 		if (!tc_qdisc_dump_ignore(q) &&
1236 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1237 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1238 			goto done;
1239 		q_idx++;
1240 	}
1241 	list_for_each_entry(q, &root->list, list) {
1242 		if (q_idx < s_q_idx) {
1243 			q_idx++;
1244 			continue;
1245 		}
1246 		if (!tc_qdisc_dump_ignore(q) &&
1247 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1248 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1249 			goto done;
1250 		q_idx++;
1251 	}
1252 
1253 out:
1254 	*q_idx_p = q_idx;
1255 	return ret;
1256 done:
1257 	ret = -1;
1258 	goto out;
1259 }
1260 
1261 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1262 {
1263 	struct net *net = sock_net(skb->sk);
1264 	int idx, q_idx;
1265 	int s_idx, s_q_idx;
1266 	struct net_device *dev;
1267 
1268 	if (net != &init_net)
1269 		return 0;
1270 
1271 	s_idx = cb->args[0];
1272 	s_q_idx = q_idx = cb->args[1];
1273 	read_lock(&dev_base_lock);
1274 	idx = 0;
1275 	for_each_netdev(&init_net, dev) {
1276 		struct netdev_queue *dev_queue;
1277 
1278 		if (idx < s_idx)
1279 			goto cont;
1280 		if (idx > s_idx)
1281 			s_q_idx = 0;
1282 		q_idx = 0;
1283 
1284 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1285 			goto done;
1286 
1287 		dev_queue = &dev->rx_queue;
1288 		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1289 			goto done;
1290 
1291 cont:
1292 		idx++;
1293 	}
1294 
1295 done:
1296 	read_unlock(&dev_base_lock);
1297 
1298 	cb->args[0] = idx;
1299 	cb->args[1] = q_idx;
1300 
1301 	return skb->len;
1302 }
1303 
1304 
1305 
1306 /************************************************
1307  *	Traffic classes manipulation.		*
1308  ************************************************/
1309 
1310 
1311 
1312 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1313 {
1314 	struct net *net = sock_net(skb->sk);
1315 	struct tcmsg *tcm = NLMSG_DATA(n);
1316 	struct nlattr *tca[TCA_MAX + 1];
1317 	struct net_device *dev;
1318 	struct Qdisc *q = NULL;
1319 	const struct Qdisc_class_ops *cops;
1320 	unsigned long cl = 0;
1321 	unsigned long new_cl;
1322 	u32 pid = tcm->tcm_parent;
1323 	u32 clid = tcm->tcm_handle;
1324 	u32 qid = TC_H_MAJ(clid);
1325 	int err;
1326 
1327 	if (net != &init_net)
1328 		return -EINVAL;
1329 
1330 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1331 		return -ENODEV;
1332 
1333 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1334 	if (err < 0)
1335 		return err;
1336 
1337 	/*
1338 	   parent == TC_H_UNSPEC - unspecified parent.
1339 	   parent == TC_H_ROOT   - class is root, which has no parent.
1340 	   parent == X:0	 - parent is root class.
1341 	   parent == X:Y	 - parent is a node in hierarchy.
1342 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1343 
1344 	   handle == 0:0	 - generate handle from kernel pool.
1345 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1346 	   handle == X:Y	 - clear.
1347 	   handle == X:0	 - root class.
1348 	 */
1349 
1350 	/* Step 1. Determine qdisc handle X:0 */
1351 
1352 	if (pid != TC_H_ROOT) {
1353 		u32 qid1 = TC_H_MAJ(pid);
1354 
1355 		if (qid && qid1) {
1356 			/* If both majors are known, they must be identical. */
1357 			if (qid != qid1)
1358 				return -EINVAL;
1359 		} else if (qid1) {
1360 			qid = qid1;
1361 		} else if (qid == 0)
1362 			qid = dev->qdisc->handle;
1363 
1364 		/* Now qid is genuine qdisc handle consistent
1365 		   both with parent and child.
1366 
1367 		   TC_H_MAJ(pid) still may be unspecified, complete it now.
1368 		 */
1369 		if (pid)
1370 			pid = TC_H_MAKE(qid, pid);
1371 	} else {
1372 		if (qid == 0)
1373 			qid = dev->qdisc->handle;
1374 	}
1375 
1376 	/* OK. Locate qdisc */
1377 	if ((q = qdisc_lookup(dev, qid)) == NULL)
1378 		return -ENOENT;
1379 
1380 	/* An check that it supports classes */
1381 	cops = q->ops->cl_ops;
1382 	if (cops == NULL)
1383 		return -EINVAL;
1384 
1385 	/* Now try to get class */
1386 	if (clid == 0) {
1387 		if (pid == TC_H_ROOT)
1388 			clid = qid;
1389 	} else
1390 		clid = TC_H_MAKE(qid, clid);
1391 
1392 	if (clid)
1393 		cl = cops->get(q, clid);
1394 
1395 	if (cl == 0) {
1396 		err = -ENOENT;
1397 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1398 			goto out;
1399 	} else {
1400 		switch (n->nlmsg_type) {
1401 		case RTM_NEWTCLASS:
1402 			err = -EEXIST;
1403 			if (n->nlmsg_flags&NLM_F_EXCL)
1404 				goto out;
1405 			break;
1406 		case RTM_DELTCLASS:
1407 			err = -EOPNOTSUPP;
1408 			if (cops->delete)
1409 				err = cops->delete(q, cl);
1410 			if (err == 0)
1411 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1412 			goto out;
1413 		case RTM_GETTCLASS:
1414 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1415 			goto out;
1416 		default:
1417 			err = -EINVAL;
1418 			goto out;
1419 		}
1420 	}
1421 
1422 	new_cl = cl;
1423 	err = -EOPNOTSUPP;
1424 	if (cops->change)
1425 		err = cops->change(q, clid, pid, tca, &new_cl);
1426 	if (err == 0)
1427 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1428 
1429 out:
1430 	if (cl)
1431 		cops->put(q, cl);
1432 
1433 	return err;
1434 }
1435 
1436 
1437 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1438 			  unsigned long cl,
1439 			  u32 pid, u32 seq, u16 flags, int event)
1440 {
1441 	struct tcmsg *tcm;
1442 	struct nlmsghdr  *nlh;
1443 	unsigned char *b = skb_tail_pointer(skb);
1444 	struct gnet_dump d;
1445 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1446 
1447 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1448 	tcm = NLMSG_DATA(nlh);
1449 	tcm->tcm_family = AF_UNSPEC;
1450 	tcm->tcm__pad1 = 0;
1451 	tcm->tcm__pad2 = 0;
1452 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1453 	tcm->tcm_parent = q->handle;
1454 	tcm->tcm_handle = q->handle;
1455 	tcm->tcm_info = 0;
1456 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1457 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1458 		goto nla_put_failure;
1459 
1460 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1461 					 qdisc_root_sleeping_lock(q), &d) < 0)
1462 		goto nla_put_failure;
1463 
1464 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1465 		goto nla_put_failure;
1466 
1467 	if (gnet_stats_finish_copy(&d) < 0)
1468 		goto nla_put_failure;
1469 
1470 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1471 	return skb->len;
1472 
1473 nlmsg_failure:
1474 nla_put_failure:
1475 	nlmsg_trim(skb, b);
1476 	return -1;
1477 }
1478 
1479 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1480 			  struct Qdisc *q, unsigned long cl, int event)
1481 {
1482 	struct sk_buff *skb;
1483 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1484 
1485 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1486 	if (!skb)
1487 		return -ENOBUFS;
1488 
1489 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1490 		kfree_skb(skb);
1491 		return -EINVAL;
1492 	}
1493 
1494 	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1495 }
1496 
1497 struct qdisc_dump_args
1498 {
1499 	struct qdisc_walker w;
1500 	struct sk_buff *skb;
1501 	struct netlink_callback *cb;
1502 };
1503 
1504 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1505 {
1506 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1507 
1508 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1509 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1510 }
1511 
1512 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1513 				struct tcmsg *tcm, struct netlink_callback *cb,
1514 				int *t_p, int s_t)
1515 {
1516 	struct qdisc_dump_args arg;
1517 
1518 	if (tc_qdisc_dump_ignore(q) ||
1519 	    *t_p < s_t || !q->ops->cl_ops ||
1520 	    (tcm->tcm_parent &&
1521 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1522 		(*t_p)++;
1523 		return 0;
1524 	}
1525 	if (*t_p > s_t)
1526 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1527 	arg.w.fn = qdisc_class_dump;
1528 	arg.skb = skb;
1529 	arg.cb = cb;
1530 	arg.w.stop  = 0;
1531 	arg.w.skip = cb->args[1];
1532 	arg.w.count = 0;
1533 	q->ops->cl_ops->walk(q, &arg.w);
1534 	cb->args[1] = arg.w.count;
1535 	if (arg.w.stop)
1536 		return -1;
1537 	(*t_p)++;
1538 	return 0;
1539 }
1540 
1541 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1542 			       struct tcmsg *tcm, struct netlink_callback *cb,
1543 			       int *t_p, int s_t)
1544 {
1545 	struct Qdisc *q;
1546 
1547 	if (!root)
1548 		return 0;
1549 
1550 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1551 		return -1;
1552 
1553 	list_for_each_entry(q, &root->list, list) {
1554 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1555 			return -1;
1556 	}
1557 
1558 	return 0;
1559 }
1560 
1561 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1562 {
1563 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1564 	struct net *net = sock_net(skb->sk);
1565 	struct netdev_queue *dev_queue;
1566 	struct net_device *dev;
1567 	int t, s_t;
1568 
1569 	if (net != &init_net)
1570 		return 0;
1571 
1572 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1573 		return 0;
1574 	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1575 		return 0;
1576 
1577 	s_t = cb->args[0];
1578 	t = 0;
1579 
1580 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1581 		goto done;
1582 
1583 	dev_queue = &dev->rx_queue;
1584 	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1585 		goto done;
1586 
1587 done:
1588 	cb->args[0] = t;
1589 
1590 	dev_put(dev);
1591 	return skb->len;
1592 }
1593 
1594 /* Main classifier routine: scans classifier chain attached
1595    to this qdisc, (optionally) tests for protocol and asks
1596    specific classifiers.
1597  */
1598 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1599 		       struct tcf_result *res)
1600 {
1601 	__be16 protocol = skb->protocol;
1602 	int err = 0;
1603 
1604 	for (; tp; tp = tp->next) {
1605 		if ((tp->protocol == protocol ||
1606 		     tp->protocol == htons(ETH_P_ALL)) &&
1607 		    (err = tp->classify(skb, tp, res)) >= 0) {
1608 #ifdef CONFIG_NET_CLS_ACT
1609 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1610 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1611 #endif
1612 			return err;
1613 		}
1614 	}
1615 	return -1;
1616 }
1617 EXPORT_SYMBOL(tc_classify_compat);
1618 
/*
 * Classify @skb against the classifier chain @tp, storing the result in
 * @res.
 *
 * This is tc_classify_compat() plus, with CONFIG_NET_CLS_ACT, handling of
 * TC_ACT_RECLASSIFY: classification is restarted from the head of the
 * chain, with the restart count kept in the skb's tc_verd.  Once the count
 * has reached MAX_REC_LOOP the packet is given up on, a (rate limited)
 * notice is logged and TC_ACT_SHOT is returned.
 *
 * Returns the value of the final tc_classify_compat() call otherwise.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			/*
			 * This runs once per offending packet; rate limit
			 * the message so a looping rule cannot flood the
			 * log, and give it an explicit log level.
			 */
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "rule prio %u protocol %02x reclassify loop, "
				       "packet dropped\n",
				       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1649 
1650 void tcf_destroy(struct tcf_proto *tp)
1651 {
1652 	tp->ops->destroy(tp);
1653 	module_put(tp->ops->owner);
1654 	kfree(tp);
1655 }
1656 
1657 void tcf_destroy_chain(struct tcf_proto **fl)
1658 {
1659 	struct tcf_proto *tp;
1660 
1661 	while ((tp = *fl) != NULL) {
1662 		*fl = tp->next;
1663 		tcf_destroy(tp);
1664 	}
1665 }
1666 EXPORT_SYMBOL(tcf_destroy_chain);
1667 
1668 #ifdef CONFIG_PROC_FS
1669 static int psched_show(struct seq_file *seq, void *v)
1670 {
1671 	struct timespec ts;
1672 
1673 	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1674 	seq_printf(seq, "%08x %08x %08x %08x\n",
1675 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1676 		   1000000,
1677 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1678 
1679 	return 0;
1680 }
1681 
1682 static int psched_open(struct inode *inode, struct file *file)
1683 {
1684 	return single_open(file, psched_show, PDE(inode)->data);
1685 }
1686 
1687 static const struct file_operations psched_fops = {
1688 	.owner = THIS_MODULE,
1689 	.open = psched_open,
1690 	.read  = seq_read,
1691 	.llseek = seq_lseek,
1692 	.release = single_release,
1693 };
1694 #endif
1695 
1696 static int __init pktsched_init(void)
1697 {
1698 	register_qdisc(&pfifo_qdisc_ops);
1699 	register_qdisc(&bfifo_qdisc_ops);
1700 	register_qdisc(&mq_qdisc_ops);
1701 	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1702 
1703 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1704 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1705 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1706 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1707 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1708 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1709 
1710 	return 0;
1711 }
1712 
1713 subsys_initcall(pktsched_init);
1714