/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when the
   device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing the packet from the queue.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
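
/* A minimal sketch (hypothetical caller, not part of this file) of how
 * the enqueue return codes above are typically interpreted:
 *
 *	int ret = q->enqueue(skb, q);
 *
 *	if (ret == NET_XMIT_SUCCESS) {
 *		// accepted; the qdisc now owns skb
 *	} else if (ret == NET_XMIT_CN) {
 *		// this skb was probably queued, but another was dropped;
 *		// upper layers may treat this as congestion and back off
 *	} else {
 *		// NET_XMIT_DROP / NET_XMIT_POLICED: skb was dropped
 *	}
 */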

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
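
/* A minimal sketch of how a qdisc module typically pairs
 * register_qdisc()/unregister_qdisc() in its module init/exit;
 * "my_qdisc_ops" is hypothetical, not a symbol defined here:
 *
 *	static int __init my_qdisc_module_init(void)
 *	{
 *		return register_qdisc(&my_qdisc_ops);
 *	}
 *
 *	static void __exit my_qdisc_module_exit(void)
 *	{
 *		unregister_qdisc(&my_qdisc_ops);
 *	}
 *
 *	module_init(my_qdisc_module_init);
 *	module_exit(my_qdisc_module_exit);
 */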

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
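
/* Sketch of the usual get/put pairing, as seen from a rate-based qdisc;
 * the attribute name and the -EINVAL policy are illustrative
 * (cf. sch_tbf.c for a real user):
 *
 *	// in ->init()/->change(), with struct tc_ratespec *r from netlink:
 *	rtab = qdisc_get_rtab(r, tb[TCA_TBF_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *
 *	// in ->destroy():
 *	qdisc_put_rtab(rtab);
 */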

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
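
/* Worked example of the size-table lookup above, with made-up numbers:
 * overhead = 4, cell_align = -1, cell_log = 6, size_log = 6. A 120-byte
 * packet gives pkt_len = 124, slot = (124 - 1) >> 6 = 1, so pkt_len
 * becomes stab->data[1] << 6, i.e. the on-wire size of a two-cell frame
 * (the classic use case being ATM-style cell framing).
 */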

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		printk(KERN_WARNING
		       "%s: %s qdisc %X: is non-work-conserving?\n",
		       txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
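
/* Typical watchdog usage from a shaping qdisc (sketch; "q->watchdog"
 * and "next_send_time" are hypothetical names):
 *
 *	// in ->init():
 *	qdisc_watchdog_init(&q->watchdog, sch);
 *
 *	// in ->dequeue(), when the head packet may not be sent yet:
 *	if (now < next_send_time) {
 *		qdisc_watchdog_schedule(&q->watchdog, next_send_time);
 *		return NULL;	// throttled; the timer reschedules us
 *	}
 *
 *	// in ->reset()/->destroy():
 *	qdisc_watchdog_cancel(&q->watchdog);
 */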

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
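
/* Sketch of how a classful qdisc wires the hash helpers together;
 * "q->clhash" and "cl->common" follow the usual naming (cf. sch_htb.c):
 *
 *	// in ->init():
 *	err = qdisc_class_hash_init(&q->clhash);
 *
 *	// when a class is created in ->change():
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);
 *
 *	// when a class is deleted, and finally in ->destroy():
 *	qdisc_class_hash_remove(&q->clhash, &cl->common);
 *	qdisc_class_hash_destroy(&q->clhash);
 */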

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
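
/* Example caller (sketch): a qdisc whose ->change() shrinks an internal
 * limit drops the excess packets itself and then tells every ancestor
 * that n packets vanished, keeping their qlen counters and any
 * qlen_notify()-driven state consistent; "trim_queue" is hypothetical:
 *
 *	dropped = trim_queue(q, new_limit);
 *	qdisc_tree_decrease_qlen(sch, dropped);
 */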

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(skb, n, classid, dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via tca.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action, so it shouldn't be necessary.
	 */
	qdisc_put_stab(sch->stab);
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from gen_replace_estimator()
		   because the change cannot be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
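
/* Concrete example of what check_loop() rejects: given the hierarchy
 *
 *	1: (root) -> class 1:1 -> 2: (child qdisc)
 *
 * a request to graft qdisc 1: under a class of 2: would make 1: its
 * own ancestor; the recursive walk above finds p on the path and the
 * operation fails with -ELOOP.
 */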

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (!net_eq(net, &init_net))
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!net_eq(net, &init_net))
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child qdisc q is already
				 *   attached to this parent and we have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant that
				 *   a qdisc with handle tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, this is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
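				/* For orientation, how iproute2 typically
				 * maps onto these flags (tool behaviour,
				 * not enforced here): "tc qdisc add" sends
				 * NLM_F_CREATE|NLM_F_EXCL, "change" sends
				 * neither, and "replace" sends
				 * NLM_F_CREATE|NLM_F_REPLACE.  So only
				 * "replace" with a different KIND takes
				 * create_n_graft below; the others fall
				 * through to the change path, where
				 * NLM_F_EXCL makes "add" fail with -EEXIST.
				 */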
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue, p,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (!net_eq(net, &init_net))
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (!net_eq(net, &init_net))
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is the root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */
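
	/* A concrete example (iproute2 usage, for orientation):
	 * "tc class add dev eth0 parent 1:1 classid 1:10 ..." arrives
	 * here with tcm_parent = 1:1 and tcm_handle = 1:10, so
	 * qid = 1:0 and both majors agree in step 1 below.
	 */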

	/* Step 1. Determine the qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (!net_eq(net, &init_net))
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol, and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk(KERN_NOTICE
			       "rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / (u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
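
/* The four hex fields above are, roughly: nanoseconds per microsecond,
 * nanoseconds per psched tick, the legacy constant 1000000, and the
 * hrtimer clock resolution in Hz; userspace (iproute2's tc) is believed
 * to read the first fields to convert between its microsecond units and
 * kernel psched ticks.
 */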

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);