/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that
   can enqueue packets and dequeue them (when the device is ready
   to send something) in an order and at times determined by the
   algorithm hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The routines in this file translate the handle-based information
   supplied by the user into a form more intelligible to the kernel,
   perform sanity checks and the parts of the work common to all
   qdiscs, and provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty iff q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
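
/* Example (illustrative sketch, not part of this file): a trivial FIFO
 * that follows the enqueue/dequeue contract above.  "example_fifo" is a
 * hypothetical qdisc; real ones also enforce a configured limit.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				   struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < qdisc_dev(sch)->tx_queue_len))
 *			return qdisc_enqueue_tail(skb, sch);  -> NET_XMIT_SUCCESS
 *		return qdisc_drop(skb, sch, to_free);         -> NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);  NULL here need not mean "empty"
 *	}
 */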

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
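
/* Typical usage from a qdisc module (sketch; "example_qdisc_ops" is a
 * hypothetical, fully populated Qdisc_ops with .id set and
 * .owner = THIS_MODULE):
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */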

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
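
/* qdisc_set_default() is also what serves writes to the
 * net.core.default_qdisc sysctl, so from userspace:
 *
 *	# sysctl -w net.core.default_qdisc=fq
 *
 * after which default_qdisc_ops is used when attaching default qdiscs
 * to devices that come up without an explicitly configured root qdisc.
 */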

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know the handle. Find the qdisc among all qdiscs attached to the
 * device (the root qdisc, all its children, children of children, etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* Older iproute2 versions did not transfer the linklayer setting, and
 * the rate table lookup system has been dropped from the kernel. To
 * stay backward compatible with older iproute2 tc utilities, we detect
 * the linklayer setting by checking whether the rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu up to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, computing the table entry one
 * below, and comparing the two.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
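
/* Worked example (hypothetical numbers): with r->mpu = 96 and
 * r->cell_log = 3, low = roundup(96, 48) = 96 and
 * high = roundup(97, 48) = 144, so cell_low = 96 >> 3 = 12 and
 * cell_high = (144 >> 3) - 1 = 17.  If rtab[12] == rtab[17], the table
 * was built with 48-byte (ATM cell) alignment and TC_LINKLAYER_ATM is
 * returned; a plain Ethernet table normally differs across that span.
 */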

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
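
/* Callers pair these around the lifetime of their rate config, e.g.
 * (sketch; TCA_FOO_RTAB is a hypothetical per-qdisc attribute --
 * sch_tbf and sch_htb follow this pattern):
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_FOO_RTAB], extack);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);	on ->destroy() or on error paths
 */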

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
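
/* Worked example (hypothetical size table): with overhead = 24,
 * cell_align = -1, cell_log = 6, size_log = 0 and tsize = 512, a
 * 1400-byte skb gives pkt_len = 1400 + 24 = 1424, then
 * slot = (1424 - 1) >> 6 = 22, so the final pkt_len is stab->data[22].
 * Slots beyond tsize are extrapolated from the last entry, as above.
 */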

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
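
/* Typical watchdog usage from a shaping qdisc (sketch; "q" is the
 * hypothetical qdisc private data, "t_next" the time the next packet
 * becomes eligible -- sch_tbf and sch_netem follow this pattern):
 *
 *	qdisc_watchdog_init(&q->watchdog, sch);		in ->init()
 *	qdisc_watchdog_schedule_ns(&q->watchdog, t_next);
 *						in ->dequeue(), before
 *						returning NULL early
 *	qdisc_watchdog_cancel(&q->watchdog);	in ->reset()/->destroy()
 */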

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
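
/* Classful qdiscs embed a struct Qdisc_class_common in each class and
 * drive these helpers roughly as follows (sketch; "q"/"cl" are
 * hypothetical -- sch_htb and sch_hfsc follow this scheme):
 *
 *	qdisc_class_hash_init(&q->clhash);		in ->init()
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);		after inserts
 *	qdisc_class_find(&q->clhash, classid);		in ->find()
 *	qdisc_class_hash_remove(&q->clhash, &cl->common);
 *	qdisc_class_hash_destroy(&q->clhash);		in ->destroy()
 */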

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
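
/* Callers invoke this after removing packets from a child qdisc behind
 * the parent's back, e.g. when a ->change() shrinks the queue limit
 * (sketch; "example_dequeue" stands for the qdisc's own dequeue logic):
 *
 *	unsigned int dropped = 0, dropped_len = 0;
 *
 *	while (sch->q.qlen > new_limit) {
 *		struct sk_buff *skb = example_dequeue(sch);
 *
 *		dropped++;
 *		dropped_len += qdisc_pkt_len(skb);
 *		rtnl_kfree_skbs(skb, skb);
 *	}
 *	qdisc_tree_reduce_backlog(sch, dropped, dropped_len);
 */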

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old,
						  extack);
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
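
/* The two qdisc_graft() branches map onto the two tc(8) attach points,
 * e.g.:
 *
 *	tc qdisc replace dev eth0 root fq_codel
 *		parent == NULL path: dev_graft_qdisc() on each tx queue
 *	tc qdisc replace dev eth0 parent 1:3 pfifo
 *		classful path: cops->graft() on class 1:3 of qdisc 1:
 */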

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to stay backward compatible with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (qdisc_is_percpu_stats(sch)) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto err_out4;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats)
			goto err_out4;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requester meant
				 *   to say that qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are
				 *   set. Alas, it is sort of a hole in the
				 *   API; we cannot decide unambiguously what
				 *   to do. For now we select create/graft if
				 *   the user gave a KIND that does not match
				 *   the existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	u32 classid;
	unsigned long cl;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl);
		sch_tree_unlock(q);
	}
	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
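
	/* Example: "tc class change dev eth0 parent 1: classid 1:10 ..."
	 * arrives with tcm_parent == 0x00010000 (1:0) and
	 * tcm_handle == 0x00010010 (1:10, minors are hex); both majors
	 * agree, so qid becomes 0x00010000 below.  If "parent" were
	 * omitted, qid would be completed from the handle, or from
	 * dev->qdisc->handle for a root class.
	 */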

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class from its filters by rebinding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; we need to do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
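
/* On a kernel with high-resolution timers this typically reads:
 *
 *	$ cat /proc/net/psched
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. 1000 ns per microsecond, 64 ns per PSCHED tick, the historical
 * 1 MHz "clock resolution", and a 1 GHz hrtimer clock; iproute2 derives
 * its tick<->time conversion factors from these.
 */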

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create("psched", 0, net->proc_net, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);