xref: /linux/net/sched/sch_api.c (revision 6ebe6dbd6886af07b102aca42e44edbee94a22d9)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39 
40 /*
41 
42    Short review.
43    -------------
44 
45    This file consists of two interrelated parts:
46 
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49 
50    Generally, a queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    the device is ready to send something) in the order and at the
53    times determined by the algorithm hidden inside it.
54 
55    qdiscs are divided into two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets into "traffic classes",
58      using "packet classifiers" (see cls_api.c).
59 
60    In turn, classes may have child qdiscs (as a rule, queues)
61    attached to them, and so on recursively.
62 
63    The goal of the routines in this file is to translate
64    the information supplied by the user in the form of handles
65    into a form more intelligible to the kernel, to perform some
66    sanity checks and the parts of the work common to all qdiscs,
67    and to provide rtnetlink notifications.
68 
69    All the real intelligent work is done inside the qdisc modules.
70 
71 
72 
73    Every discipline has two major routines: enqueue and dequeue.
74 
75    ---dequeue
76 
77    dequeue usually returns an skb to send. It is allowed to return NULL,
78    but that does not mean that the queue is empty; it just means that
79    the discipline does not want to send anything this time.
80    The queue is really empty only if q->q.qlen == 0.
81    For complicated disciplines with multiple queues, q->q is not the
82    real packet queue, but q->q.qlen must nevertheless be valid.
83 
84    ---enqueue
85 
86    enqueue returns 0 if the packet was enqueued successfully.
87    If a packet (this one or another one) was dropped, it returns
88    a non-zero error code:
89    NET_XMIT_DROP 	- this packet was dropped.
90      Expected action: do not back off, but wait until the queue clears.
91    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
92      Expected action: back off or ignore.
93 
94    Auxiliary routines:
95 
96    ---peek
97 
98    like dequeue but without removing a packet from the queue
99 
100    ---reset
101 
102    returns the qdisc to its initial state: purges all buffers, clears
103    all timers and counters (except for statistics), etc.
104 
105    ---init
106 
107    initializes a newly created qdisc.
108 
109    ---destroy
110 
111    destroys the resources allocated by init and during the qdisc's lifetime.
112 
113    ---change
114 
115    changes qdisc parameters.
116  */
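
/*
 * To make the enqueue/dequeue contract above concrete, here is a
 * minimal FIFO-style sketch (modelled on sch_fifo.c; the example_*
 * names are illustrative and not part of this file). enqueue returns
 * NET_XMIT_SUCCESS (0) or NET_XMIT_DROP; dequeue may return NULL
 * while q->q.qlen stays valid:
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				   struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < sch->limit))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_drop(skb, sch, to_free);
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);
 *	}
 */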
117 
118 /* Protects the list of registered TC modules. It is a pure SMP lock. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120 
121 
122 /************************************************
123  *	Queueing disciplines manipulation.	*
124  ************************************************/
125 
126 
127 /* The list of all installed queueing disciplines. */
128 
129 static struct Qdisc_ops *qdisc_base;
130 
131 /* Register/unregister queueing discipline */
132 
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135 	struct Qdisc_ops *q, **qp;
136 	int rc = -EEXIST;
137 
138 	write_lock(&qdisc_mod_lock);
139 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140 		if (!strcmp(qops->id, q->id))
141 			goto out;
142 
143 	if (qops->enqueue == NULL)
144 		qops->enqueue = noop_qdisc_ops.enqueue;
145 	if (qops->peek == NULL) {
146 		if (qops->dequeue == NULL)
147 			qops->peek = noop_qdisc_ops.peek;
148 		else
149 			goto out_einval;
150 	}
151 	if (qops->dequeue == NULL)
152 		qops->dequeue = noop_qdisc_ops.dequeue;
153 
154 	if (qops->cl_ops) {
155 		const struct Qdisc_class_ops *cops = qops->cl_ops;
156 
157 		if (!(cops->find && cops->walk && cops->leaf))
158 			goto out_einval;
159 
160 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161 			goto out_einval;
162 	}
163 
164 	qops->next = NULL;
165 	*qp = qops;
166 	rc = 0;
167 out:
168 	write_unlock(&qdisc_mod_lock);
169 	return rc;
170 
171 out_einval:
172 	rc = -EINVAL;
173 	goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176 
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179 	struct Qdisc_ops *q, **qp;
180 	int err = -ENOENT;
181 
182 	write_lock(&qdisc_mod_lock);
183 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184 		if (q == qops)
185 			break;
186 	if (q) {
187 		*qp = q->next;
188 		q->next = NULL;
189 		err = 0;
190 	}
191 	write_unlock(&qdisc_mod_lock);
192 	return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
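
/*
 * A qdisc module typically pairs these calls in its init/exit hooks
 * (a sketch continuing the hypothetical example_* qdisc from the
 * overview above; naming the module sch_example lets the
 * request_module("sch_%s", ...) autoload path find it):
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */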
195 
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
198 {
199 	read_lock(&qdisc_mod_lock);
200 	strlcpy(name, default_qdisc_ops->id, len);
201 	read_unlock(&qdisc_mod_lock);
202 }
203 
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206 	struct Qdisc_ops *q = NULL;
207 
208 	for (q = qdisc_base; q; q = q->next) {
209 		if (!strcmp(name, q->id)) {
210 			if (!try_module_get(q->owner))
211 				q = NULL;
212 			break;
213 		}
214 	}
215 
216 	return q;
217 }
218 
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222 	const struct Qdisc_ops *ops;
223 
224 	if (!capable(CAP_NET_ADMIN))
225 		return -EPERM;
226 
227 	write_lock(&qdisc_mod_lock);
228 	ops = qdisc_lookup_default(name);
229 	if (!ops) {
230 		/* Not found, drop lock and try to load module */
231 		write_unlock(&qdisc_mod_lock);
232 		request_module("sch_%s", name);
233 		write_lock(&qdisc_mod_lock);
234 
235 		ops = qdisc_lookup_default(name);
236 	}
237 
238 	if (ops) {
239 		/* Set new default */
240 		module_put(default_qdisc_ops->owner);
241 		default_qdisc_ops = ops;
242 	}
243 	write_unlock(&qdisc_mod_lock);
244 
245 	return ops ? 0 : -ENOENT;
246 }
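
/*
 * Example: qdisc_set_default("fq") first looks for an already
 * registered "fq"; failing that, it drops the lock, loads the module
 * via request_module("sch_%s", ...) and retries the lookup. In
 * practice this function is driven by the net.core.default_qdisc
 * sysctl, e.g. "sysctl -w net.core.default_qdisc=fq".
 */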
247 
248 #ifdef CONFIG_NET_SCH_DEFAULT
249 /* Set default value from kernel config */
250 static int __init sch_default_qdisc(void)
251 {
252 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
253 }
254 late_initcall(sch_default_qdisc);
255 #endif
256 
257 /* We know the handle. Find the qdisc among all qdiscs attached to the
258  * device (root qdisc, all its children, children of children, etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261 
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264 	struct Qdisc *q;
265 
266 	if (!qdisc_dev(root))
267 		return (root->handle == handle ? root : NULL);
268 
269 	if (!(root->flags & TCQ_F_BUILTIN) &&
270 	    root->handle == handle)
271 		return root;
272 
273 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274 		if (q->handle == handle)
275 			return q;
276 	}
277 	return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 		ASSERT_RTNL();
284 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 		if (invisible)
286 			q->flags |= TCQ_F_INVISIBLE;
287 	}
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 		ASSERT_RTNL();
295 		hash_del_rcu(&q->hash);
296 	}
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 	struct Qdisc *q;
303 
304 	if (!handle)
305 		return NULL;
306 	q = qdisc_match_from_root(dev->qdisc, handle);
307 	if (q)
308 		goto out;
309 
310 	if (dev_ingress_queue(dev))
311 		q = qdisc_match_from_root(
312 			dev_ingress_queue(dev)->qdisc_sleeping,
313 			handle);
314 out:
315 	return q;
316 }
317 
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320 	unsigned long cl;
321 	struct Qdisc *leaf;
322 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323 
324 	if (cops == NULL)
325 		return NULL;
326 	cl = cops->find(p, classid);
327 
328 	if (cl == 0)
329 		return NULL;
330 	leaf = cops->leaf(p, cl);
331 	return leaf;
332 }
333 
334 /* Find queueing discipline by name */
335 
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338 	struct Qdisc_ops *q = NULL;
339 
340 	if (kind) {
341 		read_lock(&qdisc_mod_lock);
342 		for (q = qdisc_base; q; q = q->next) {
343 			if (nla_strcmp(kind, q->id) == 0) {
344 				if (!try_module_get(q->owner))
345 					q = NULL;
346 				break;
347 			}
348 		}
349 		read_unlock(&qdisc_mod_lock);
350 	}
351 	return q;
352 }
353 
354 /* The linklayer setting was not transferred from older iproute2
355  * versions, and the rate table lookup system has been dropped from
356  * the kernel. To stay backward compatible with older iproute2 tc
357  * utilities, we detect the linklayer setting by detecting whether
358  * the rate table was modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find the low and high table entries for
364  * mapping this cell.  If these entries contain the same value, then
365  * the rate table has been modified for linklayer ATM.
366  *
367  * This is done by rounding the mpu up to a 48-byte cell boundary,
368  * then rounding up to the next cell, calculating the table entry one
369  * below that, and comparing the two.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373 	int low       = roundup(r->mpu, 48);
374 	int high      = roundup(low+1, 48);
375 	int cell_low  = low >> r->cell_log;
376 	int cell_high = (high >> r->cell_log) - 1;
377 
378 	/* rtab is too inaccurate at rates > 100Mbit/s */
379 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380 		pr_debug("TC linklayer: Giving up ATM detection\n");
381 		return TC_LINKLAYER_ETHERNET;
382 	}
383 
384 	if ((cell_high > cell_low) && (cell_high < 256)
385 	    && (rtab[cell_low] == rtab[cell_high])) {
386 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387 			 cell_low, cell_high, rtab[cell_high]);
388 		return TC_LINKLAYER_ATM;
389 	}
390 	return TC_LINKLAYER_ETHERNET;
391 }
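
/*
 * Worked example (illustrative numbers): with mpu = 0 and cell_log = 3,
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, giving
 * cell_low = 0 and cell_high = (48 >> 3) - 1 = 5. Both indices fall
 * within the same 48-byte ATM cell, so an ATM-aligned table has
 * rtab[0] == rtab[5] and we return TC_LINKLAYER_ATM, while an Ethernet
 * table normally differs between those two slots.
 */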
392 
393 static struct qdisc_rate_table *qdisc_rtab_list;
394 
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396 					struct nlattr *tab,
397 					struct netlink_ext_ack *extack)
398 {
399 	struct qdisc_rate_table *rtab;
400 
401 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402 	    nla_len(tab) != TC_RTAB_SIZE) {
403 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404 		return NULL;
405 	}
406 
407 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
409 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
410 			rtab->refcnt++;
411 			return rtab;
412 		}
413 	}
414 
415 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
416 	if (rtab) {
417 		rtab->rate = *r;
418 		rtab->refcnt = 1;
419 		memcpy(rtab->data, nla_data(tab), 1024);
420 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
421 			r->linklayer = __detect_linklayer(r, rtab->data);
422 		rtab->next = qdisc_rtab_list;
423 		qdisc_rtab_list = rtab;
424 	} else {
425 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
426 	}
427 	return rtab;
428 }
429 EXPORT_SYMBOL(qdisc_get_rtab);
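
/*
 * The 1024-byte memcmp/memcpy above is TC_RTAB_SIZE: a rate table is
 * a fixed array of 256 u32 transmit times, one per size cell. Tables
 * built from identical tc_ratespec and data are shared and
 * reference-counted rather than duplicated.
 */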
430 
431 void qdisc_put_rtab(struct qdisc_rate_table *tab)
432 {
433 	struct qdisc_rate_table *rtab, **rtabp;
434 
435 	if (!tab || --tab->refcnt)
436 		return;
437 
438 	for (rtabp = &qdisc_rtab_list;
439 	     (rtab = *rtabp) != NULL;
440 	     rtabp = &rtab->next) {
441 		if (rtab == tab) {
442 			*rtabp = rtab->next;
443 			kfree(rtab);
444 			return;
445 		}
446 	}
447 }
448 EXPORT_SYMBOL(qdisc_put_rtab);
449 
450 static LIST_HEAD(qdisc_stab_list);
451 
452 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
453 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
454 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
455 };
456 
457 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
458 					       struct netlink_ext_ack *extack)
459 {
460 	struct nlattr *tb[TCA_STAB_MAX + 1];
461 	struct qdisc_size_table *stab;
462 	struct tc_sizespec *s;
463 	unsigned int tsize = 0;
464 	u16 *tab = NULL;
465 	int err;
466 
467 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468 	if (err < 0)
469 		return ERR_PTR(err);
470 	if (!tb[TCA_STAB_BASE]) {
471 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472 		return ERR_PTR(-EINVAL);
473 	}
474 
475 	s = nla_data(tb[TCA_STAB_BASE]);
476 
477 	if (s->tsize > 0) {
478 		if (!tb[TCA_STAB_DATA]) {
479 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480 			return ERR_PTR(-EINVAL);
481 		}
482 		tab = nla_data(tb[TCA_STAB_DATA]);
483 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
484 	}
485 
486 	if (tsize != s->tsize || (!tab && tsize > 0)) {
487 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
488 		return ERR_PTR(-EINVAL);
489 	}
490 
491 	list_for_each_entry(stab, &qdisc_stab_list, list) {
492 		if (memcmp(&stab->szopts, s, sizeof(*s)))
493 			continue;
494 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
495 			continue;
496 		stab->refcnt++;
497 		return stab;
498 	}
499 
500 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
501 	if (!stab)
502 		return ERR_PTR(-ENOMEM);
503 
504 	stab->refcnt = 1;
505 	stab->szopts = *s;
506 	if (tsize > 0)
507 		memcpy(stab->data, tab, tsize * sizeof(u16));
508 
509 	list_add_tail(&stab->list, &qdisc_stab_list);
510 
511 	return stab;
512 }
513 
514 static void stab_kfree_rcu(struct rcu_head *head)
515 {
516 	kfree(container_of(head, struct qdisc_size_table, rcu));
517 }
518 
519 void qdisc_put_stab(struct qdisc_size_table *tab)
520 {
521 	if (!tab)
522 		return;
523 
524 	if (--tab->refcnt == 0) {
525 		list_del(&tab->list);
526 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
527 	}
528 }
529 EXPORT_SYMBOL(qdisc_put_stab);
530 
531 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
532 {
533 	struct nlattr *nest;
534 
535 	nest = nla_nest_start(skb, TCA_STAB);
536 	if (nest == NULL)
537 		goto nla_put_failure;
538 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
539 		goto nla_put_failure;
540 	nla_nest_end(skb, nest);
541 
542 	return skb->len;
543 
544 nla_put_failure:
545 	return -1;
546 }
547 
548 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
549 			       const struct qdisc_size_table *stab)
550 {
551 	int pkt_len, slot;
552 
553 	pkt_len = skb->len + stab->szopts.overhead;
554 	if (unlikely(!stab->szopts.tsize))
555 		goto out;
556 
557 	slot = pkt_len + stab->szopts.cell_align;
558 	if (unlikely(slot < 0))
559 		slot = 0;
560 
561 	slot >>= stab->szopts.cell_log;
562 	if (likely(slot < stab->szopts.tsize))
563 		pkt_len = stab->data[slot];
564 	else
565 		pkt_len = stab->data[stab->szopts.tsize - 1] *
566 				(slot / stab->szopts.tsize) +
567 				stab->data[slot % stab->szopts.tsize];
568 
569 	pkt_len <<= stab->szopts.size_log;
570 out:
571 	if (unlikely(pkt_len < 1))
572 		pkt_len = 1;
573 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
574 }
575 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
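
/*
 * Worked example (illustrative numbers): with szopts.overhead = 24,
 * cell_align = 0, cell_log = 6, size_log = 0 and tsize = 512, a
 * 1000-byte skb gives pkt_len = 1024 and slot = 1024 >> 6 = 16, so
 * the packet is accounted as stab->data[16] bytes. Slots beyond the
 * table are extrapolated from the last entry, as above.
 */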
576 
577 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
578 {
579 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
580 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
581 			txt, qdisc->ops->id, qdisc->handle >> 16);
582 		qdisc->flags |= TCQ_F_WARN_NONWC;
583 	}
584 }
585 EXPORT_SYMBOL(qdisc_warn_nonwc);
586 
587 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
588 {
589 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590 						 timer);
591 
592 	rcu_read_lock();
593 	__netif_schedule(qdisc_root(wd->qdisc));
594 	rcu_read_unlock();
595 
596 	return HRTIMER_NORESTART;
597 }
598 
599 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
600 {
601 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
602 	wd->timer.function = qdisc_watchdog;
603 	wd->qdisc = qdisc;
604 }
605 EXPORT_SYMBOL(qdisc_watchdog_init);
606 
607 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
608 {
609 	if (test_bit(__QDISC_STATE_DEACTIVATED,
610 		     &qdisc_root_sleeping(wd->qdisc)->state))
611 		return;
612 
613 	if (wd->last_expires == expires)
614 		return;
615 
616 	wd->last_expires = expires;
617 	hrtimer_start(&wd->timer,
618 		      ns_to_ktime(expires),
619 		      HRTIMER_MODE_ABS_PINNED);
620 }
621 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
622 
623 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
624 {
625 	hrtimer_cancel(&wd->timer);
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_cancel);
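
/*
 * Typical watchdog usage in a rate-limiting qdisc (a sketch in the
 * style of sch_tbf; q->watchdog and t_next stand for the qdisc's own
 * private state):
 *
 *	->init:    qdisc_watchdog_init(&q->watchdog, sch);
 *	->dequeue: if the head packet must not be sent before t_next,
 *	           return NULL after calling
 *	           qdisc_watchdog_schedule_ns(&q->watchdog, t_next);
 *	->reset/->destroy: qdisc_watchdog_cancel(&q->watchdog);
 *
 * When the hrtimer fires, qdisc_watchdog() above simply reschedules
 * the root qdisc so that ->dequeue is tried again.
 */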
628 
629 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
630 {
631 	struct hlist_head *h;
632 	unsigned int i;
633 
634 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
635 
636 	if (h != NULL) {
637 		for (i = 0; i < n; i++)
638 			INIT_HLIST_HEAD(&h[i]);
639 	}
640 	return h;
641 }
642 
643 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
644 {
645 	struct Qdisc_class_common *cl;
646 	struct hlist_node *next;
647 	struct hlist_head *nhash, *ohash;
648 	unsigned int nsize, nmask, osize;
649 	unsigned int i, h;
650 
651 	/* Rehash when load factor exceeds 0.75 */
652 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
653 		return;
654 	nsize = clhash->hashsize * 2;
655 	nmask = nsize - 1;
656 	nhash = qdisc_class_hash_alloc(nsize);
657 	if (nhash == NULL)
658 		return;
659 
660 	ohash = clhash->hash;
661 	osize = clhash->hashsize;
662 
663 	sch_tree_lock(sch);
664 	for (i = 0; i < osize; i++) {
665 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
666 			h = qdisc_class_hash(cl->classid, nmask);
667 			hlist_add_head(&cl->hnode, &nhash[h]);
668 		}
669 	}
670 	clhash->hash     = nhash;
671 	clhash->hashsize = nsize;
672 	clhash->hashmask = nmask;
673 	sch_tree_unlock(sch);
674 
675 	kvfree(ohash);
676 }
677 EXPORT_SYMBOL(qdisc_class_hash_grow);
678 
679 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
680 {
681 	unsigned int size = 4;
682 
683 	clhash->hash = qdisc_class_hash_alloc(size);
684 	if (!clhash->hash)
685 		return -ENOMEM;
686 	clhash->hashsize  = size;
687 	clhash->hashmask  = size - 1;
688 	clhash->hashelems = 0;
689 	return 0;
690 }
691 EXPORT_SYMBOL(qdisc_class_hash_init);
692 
693 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
694 {
695 	kvfree(clhash->hash);
696 }
697 EXPORT_SYMBOL(qdisc_class_hash_destroy);
698 
699 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
700 			     struct Qdisc_class_common *cl)
701 {
702 	unsigned int h;
703 
704 	INIT_HLIST_NODE(&cl->hnode);
705 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
706 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
707 	clhash->hashelems++;
708 }
709 EXPORT_SYMBOL(qdisc_class_hash_insert);
710 
711 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
712 			     struct Qdisc_class_common *cl)
713 {
714 	hlist_del(&cl->hnode);
715 	clhash->hashelems--;
716 }
717 EXPORT_SYMBOL(qdisc_class_hash_remove);
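
/*
 * Typical class-hash life cycle in a classful qdisc (a sketch in the
 * style of sch_htb; q->clhash and cl stand for the qdisc's private
 * state):
 *
 *	->init:       qdisc_class_hash_init(&q->clhash);
 *	class create: qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	              qdisc_class_hash_grow(sch, &q->clhash);
 *	class delete: qdisc_class_hash_remove(&q->clhash, &cl->common);
 *	->destroy:    qdisc_class_hash_destroy(&q->clhash);
 *
 * qdisc_class_hash_grow() is cheap to call unconditionally: it only
 * rehashes once the load factor exceeds 3/4.
 */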
718 
719 /* Allocate a unique handle from the space managed by the kernel.
720  * The possible range is [8000-FFFF]:0000 (0x8000 values).
721  */
722 static u32 qdisc_alloc_handle(struct net_device *dev)
723 {
724 	int i = 0x8000;
725 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
726 
727 	do {
728 		autohandle += TC_H_MAKE(0x10000U, 0);
729 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
730 			autohandle = TC_H_MAKE(0x80000000U, 0);
731 		if (!qdisc_lookup(dev, autohandle))
732 			return autohandle;
733 		cond_resched();
734 	} while	(--i > 0);
735 
736 	return 0;
737 }
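
/*
 * Example: in tc notation the kernel-managed handles are 8000:,
 * 8001:, ... ffff:. Each iteration above bumps the major number by
 * one (TC_H_MAKE(0x10000U, 0)), wraps around before reaching
 * TC_H_ROOT, and gives up with 0 once all 0x8000 majors are in use.
 */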
738 
739 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
740 			       unsigned int len)
741 {
742 	const struct Qdisc_class_ops *cops;
743 	unsigned long cl;
744 	u32 parentid;
745 	bool notify;
746 	int drops;
747 
748 	if (n == 0 && len == 0)
749 		return;
750 	drops = max_t(int, n, 0);
751 	rcu_read_lock();
752 	while ((parentid = sch->parent)) {
753 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
754 			break;
755 
756 		if (sch->flags & TCQ_F_NOPARENT)
757 			break;
758 		/* Notify the parent qdisc only if the child qdisc becomes empty.
759 		 *
760 		 * If the child was empty even before this update, then the
761 		 * backlog counter is inconsistent and we skip the notification,
762 		 * because the parent class is already passive.
763 		 */
764 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
765 		/* TODO: perform the search on a per txq basis */
766 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
767 		if (sch == NULL) {
768 			WARN_ON_ONCE(parentid != TC_H_ROOT);
769 			break;
770 		}
771 		cops = sch->ops->cl_ops;
772 		if (notify && cops->qlen_notify) {
773 			cl = cops->find(sch, parentid);
774 			cops->qlen_notify(sch, cl);
775 		}
776 		sch->q.qlen -= n;
777 		sch->qstats.backlog -= len;
778 		__qdisc_qstats_drop(sch, drops);
779 	}
780 	rcu_read_unlock();
781 }
782 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
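
/*
 * Example: if a leaf fifo hanging off an HTB class drops 3 packets
 * totalling 4500 bytes, its owner calls
 * qdisc_tree_reduce_backlog(leaf, 3, 4500); the loop above then walks
 * up through every ancestor qdisc, fixing qlen/backlog on each, and
 * lets classful parents deactivate the now-empty class via
 * ->qlen_notify().
 */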
783 
784 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
785 			 u32 portid, u32 seq, u16 flags, int event)
786 {
787 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
788 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
789 	struct tcmsg *tcm;
790 	struct nlmsghdr  *nlh;
791 	unsigned char *b = skb_tail_pointer(skb);
792 	struct gnet_dump d;
793 	struct qdisc_size_table *stab;
794 	__u32 qlen;
795 
796 	cond_resched();
797 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
798 	if (!nlh)
799 		goto out_nlmsg_trim;
800 	tcm = nlmsg_data(nlh);
801 	tcm->tcm_family = AF_UNSPEC;
802 	tcm->tcm__pad1 = 0;
803 	tcm->tcm__pad2 = 0;
804 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
805 	tcm->tcm_parent = clid;
806 	tcm->tcm_handle = q->handle;
807 	tcm->tcm_info = refcount_read(&q->refcnt);
808 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
809 		goto nla_put_failure;
810 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
811 		goto nla_put_failure;
812 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
813 		goto nla_put_failure;
814 
815 	qlen = qdisc_qlen_sum(q);
816 
817 	stab = rtnl_dereference(q->stab);
818 	if (stab && qdisc_dump_stab(skb, stab) < 0)
819 		goto nla_put_failure;
820 
821 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
822 					 NULL, &d, TCA_PAD) < 0)
823 		goto nla_put_failure;
824 
825 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
826 		goto nla_put_failure;
827 
828 	if (qdisc_is_percpu_stats(q)) {
829 		cpu_bstats = q->cpu_bstats;
830 		cpu_qstats = q->cpu_qstats;
831 	}
832 
833 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
834 				  &d, cpu_bstats, &q->bstats) < 0 ||
835 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
836 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
837 		goto nla_put_failure;
838 
839 	if (gnet_stats_finish_copy(&d) < 0)
840 		goto nla_put_failure;
841 
842 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
843 	return skb->len;
844 
845 out_nlmsg_trim:
846 nla_put_failure:
847 	nlmsg_trim(skb, b);
848 	return -1;
849 }
850 
851 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
852 {
853 	if (q->flags & TCQ_F_BUILTIN)
854 		return true;
855 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
856 		return true;
857 
858 	return false;
859 }
860 
861 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
862 			struct nlmsghdr *n, u32 clid,
863 			struct Qdisc *old, struct Qdisc *new)
864 {
865 	struct sk_buff *skb;
866 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
867 
868 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
869 	if (!skb)
870 		return -ENOBUFS;
871 
872 	if (old && !tc_qdisc_dump_ignore(old, false)) {
873 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
874 				  0, RTM_DELQDISC) < 0)
875 			goto err_out;
876 	}
877 	if (new && !tc_qdisc_dump_ignore(new, false)) {
878 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
879 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
880 			goto err_out;
881 	}
882 
883 	if (skb->len)
884 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
885 				      n->nlmsg_flags & NLM_F_ECHO);
886 
887 err_out:
888 	kfree_skb(skb);
889 	return -EINVAL;
890 }
891 
892 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
893 			       struct nlmsghdr *n, u32 clid,
894 			       struct Qdisc *old, struct Qdisc *new)
895 {
896 	if (new || old)
897 		qdisc_notify(net, skb, n, clid, old, new);
898 
899 	if (old)
900 		qdisc_destroy(old);
901 }
902 
903 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
904  * to device "dev".
905  *
906  * When appropriate, send a netlink notification using "skb"
907  * and "n".
908  *
909  * On success, destroy the old qdisc.
910  */
911 
912 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
913 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
914 		       struct Qdisc *new, struct Qdisc *old,
915 		       struct netlink_ext_ack *extack)
916 {
917 	struct Qdisc *q = old;
918 	struct net *net = dev_net(dev);
919 	int err = 0;
920 
921 	if (parent == NULL) {
922 		unsigned int i, num_q, ingress;
923 
924 		ingress = 0;
925 		num_q = dev->num_tx_queues;
926 		if ((q && q->flags & TCQ_F_INGRESS) ||
927 		    (new && new->flags & TCQ_F_INGRESS)) {
928 			num_q = 1;
929 			ingress = 1;
930 			if (!dev_ingress_queue(dev)) {
931 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
932 				return -ENOENT;
933 			}
934 		}
935 
936 		if (dev->flags & IFF_UP)
937 			dev_deactivate(dev);
938 
939 		if (new && new->ops->attach)
940 			goto skip;
941 
942 		for (i = 0; i < num_q; i++) {
943 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
944 
945 			if (!ingress)
946 				dev_queue = netdev_get_tx_queue(dev, i);
947 
948 			old = dev_graft_qdisc(dev_queue, new);
949 			if (new && i > 0)
950 				qdisc_refcount_inc(new);
951 
952 			if (!ingress)
953 				qdisc_destroy(old);
954 		}
955 
956 skip:
957 		if (!ingress) {
958 			notify_and_destroy(net, skb, n, classid,
959 					   dev->qdisc, new);
960 			if (new && !new->ops->attach)
961 				qdisc_refcount_inc(new);
962 			dev->qdisc = new ? : &noop_qdisc;
963 
964 			if (new && new->ops->attach)
965 				new->ops->attach(new);
966 		} else {
967 			notify_and_destroy(net, skb, n, classid, old, new);
968 		}
969 
970 		if (dev->flags & IFF_UP)
971 			dev_activate(dev);
972 	} else {
973 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
974 
975 		/* Only support running class lockless if parent is lockless */
976 		if (new && (new->flags & TCQ_F_NOLOCK) &&
977 		    parent && !(parent->flags & TCQ_F_NOLOCK))
978 			new->flags &= ~TCQ_F_NOLOCK;
979 
980 		err = -EOPNOTSUPP;
981 		if (cops && cops->graft) {
982 			unsigned long cl = cops->find(parent, classid);
983 
984 			if (cl) {
985 				err = cops->graft(parent, cl, new, &old,
986 						  extack);
987 			} else {
988 				NL_SET_ERR_MSG(extack, "Specified class not found");
989 				err = -ENOENT;
990 			}
991 		}
992 		if (!err)
993 			notify_and_destroy(net, skb, n, classid, old, new);
994 	}
995 	return err;
996 }
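
/*
 * Example: "tc qdisc replace dev eth0 root ..." reaches this function
 * with parent == NULL, so the new qdisc is grafted onto every tx queue
 * of the device (or onto the single ingress queue), while "tc qdisc
 * add dev eth0 parent 1:1 ..." arrives with a non-NULL parent and is
 * delegated to that qdisc's class ops via ->graft().
 */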
997 
998 /* lockdep annotation is needed for ingress; egress gets it only for name */
999 static struct lock_class_key qdisc_tx_lock;
1000 static struct lock_class_key qdisc_rx_lock;
1001 
1002 /*
1003    Allocate and initialize a new qdisc.
1004 
1005    Parameters are passed via opt.
1006  */
1007 
1008 static struct Qdisc *qdisc_create(struct net_device *dev,
1009 				  struct netdev_queue *dev_queue,
1010 				  struct Qdisc *p, u32 parent, u32 handle,
1011 				  struct nlattr **tca, int *errp,
1012 				  struct netlink_ext_ack *extack)
1013 {
1014 	int err;
1015 	struct nlattr *kind = tca[TCA_KIND];
1016 	struct Qdisc *sch;
1017 	struct Qdisc_ops *ops;
1018 	struct qdisc_size_table *stab;
1019 
1020 	ops = qdisc_lookup_ops(kind);
1021 #ifdef CONFIG_MODULES
1022 	if (ops == NULL && kind != NULL) {
1023 		char name[IFNAMSIZ];
1024 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1025 			/* We dropped the RTNL semaphore in order to
1026 			 * perform the module load.  So, even if we
1027 			 * succeeded in loading the module, we have to
1028 			 * tell the caller to replay the request.  We
1029 			 * indicate this using -EAGAIN.
1030 			 * We replay the request because the device may
1031 			 * go away in the meantime.
1032 			 */
1033 			rtnl_unlock();
1034 			request_module("sch_%s", name);
1035 			rtnl_lock();
1036 			ops = qdisc_lookup_ops(kind);
1037 			if (ops != NULL) {
1038 			/* We will try qdisc_lookup_ops again,
1039 				 * so don't keep a reference.
1040 				 */
1041 				module_put(ops->owner);
1042 				err = -EAGAIN;
1043 				goto err_out;
1044 			}
1045 		}
1046 	}
1047 #endif
1048 
1049 	err = -ENOENT;
1050 	if (!ops) {
1051 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1052 		goto err_out;
1053 	}
1054 
1055 	sch = qdisc_alloc(dev_queue, ops, extack);
1056 	if (IS_ERR(sch)) {
1057 		err = PTR_ERR(sch);
1058 		goto err_out2;
1059 	}
1060 
1061 	sch->parent = parent;
1062 
1063 	if (handle == TC_H_INGRESS) {
1064 		sch->flags |= TCQ_F_INGRESS;
1065 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1066 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1067 	} else {
1068 		if (handle == 0) {
1069 			handle = qdisc_alloc_handle(dev);
1070 			err = -ENOMEM;
1071 			if (handle == 0)
1072 				goto err_out3;
1073 		}
1074 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1075 		if (!netif_is_multiqueue(dev))
1076 			sch->flags |= TCQ_F_ONETXQUEUE;
1077 	}
1078 
1079 	sch->handle = handle;
1080 
1081 	/* This exists to stay backward compatible with a userspace
1082 	 * loophole that allowed userspace to get the IFF_NO_QUEUE
1083 	 * facility on older kernels by setting tx_queue_len=0 (prior
1084 	 * to qdisc init), and then forgetting to reinit tx_queue_len
1085 	 * before attaching a qdisc again.
1086 	 */
1087 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1088 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1089 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1090 	}
1091 
1092 	if (ops->init) {
1093 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1094 		if (err != 0)
1095 			goto err_out5;
1096 	}
1097 
1098 	if (qdisc_is_percpu_stats(sch)) {
1099 		sch->cpu_bstats =
1100 			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
1101 		if (!sch->cpu_bstats)
1102 			goto err_out4;
1103 
1104 		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
1105 		if (!sch->cpu_qstats)
1106 			goto err_out4;
1107 	}
1108 
1109 	if (tca[TCA_STAB]) {
1110 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1111 		if (IS_ERR(stab)) {
1112 			err = PTR_ERR(stab);
1113 			goto err_out4;
1114 		}
1115 		rcu_assign_pointer(sch->stab, stab);
1116 	}
1117 	if (tca[TCA_RATE]) {
1118 		seqcount_t *running;
1119 
1120 		err = -EOPNOTSUPP;
1121 		if (sch->flags & TCQ_F_MQROOT) {
1122 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1123 			goto err_out4;
1124 		}
1125 
1126 		if (sch->parent != TC_H_ROOT &&
1127 		    !(sch->flags & TCQ_F_INGRESS) &&
1128 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1129 			running = qdisc_root_sleeping_running(sch);
1130 		else
1131 			running = &sch->running;
1132 
1133 		err = gen_new_estimator(&sch->bstats,
1134 					sch->cpu_bstats,
1135 					&sch->rate_est,
1136 					NULL,
1137 					running,
1138 					tca[TCA_RATE]);
1139 		if (err) {
1140 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1141 			goto err_out4;
1142 		}
1143 	}
1144 
1145 	qdisc_hash_add(sch, false);
1146 
1147 	return sch;
1148 
1149 err_out5:
1150 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1151 	if (ops->destroy)
1152 		ops->destroy(sch);
1153 err_out3:
1154 	dev_put(dev);
1155 	kfree((char *) sch - sch->padded);
1156 err_out2:
1157 	module_put(ops->owner);
1158 err_out:
1159 	*errp = err;
1160 	return NULL;
1161 
1162 err_out4:
1163 	free_percpu(sch->cpu_bstats);
1164 	free_percpu(sch->cpu_qstats);
1165 	/*
1166 	 * Any broken qdiscs that would require an ops->reset() here?
1167 	 * The qdisc was never in action so it shouldn't be necessary.
1168 	 */
1169 	qdisc_put_stab(rtnl_dereference(sch->stab));
1170 	if (ops->destroy)
1171 		ops->destroy(sch);
1172 	goto err_out3;
1173 }
1174 
1175 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1176 			struct netlink_ext_ack *extack)
1177 {
1178 	struct qdisc_size_table *ostab, *stab = NULL;
1179 	int err = 0;
1180 
1181 	if (tca[TCA_OPTIONS]) {
1182 		if (!sch->ops->change) {
1183 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1184 			return -EINVAL;
1185 		}
1186 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1187 		if (err)
1188 			return err;
1189 	}
1190 
1191 	if (tca[TCA_STAB]) {
1192 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1193 		if (IS_ERR(stab))
1194 			return PTR_ERR(stab);
1195 	}
1196 
1197 	ostab = rtnl_dereference(sch->stab);
1198 	rcu_assign_pointer(sch->stab, stab);
1199 	qdisc_put_stab(ostab);
1200 
1201 	if (tca[TCA_RATE]) {
1202 		/* NB: ignores errors from replace_estimator
1203 		   because the change cannot be undone. */
1204 		if (sch->flags & TCQ_F_MQROOT)
1205 			goto out;
1206 		gen_replace_estimator(&sch->bstats,
1207 				      sch->cpu_bstats,
1208 				      &sch->rate_est,
1209 				      NULL,
1210 				      qdisc_root_sleeping_running(sch),
1211 				      tca[TCA_RATE]);
1212 	}
1213 out:
1214 	return 0;
1215 }
1216 
1217 struct check_loop_arg {
1218 	struct qdisc_walker	w;
1219 	struct Qdisc		*p;
1220 	int			depth;
1221 };
1222 
1223 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1224 			 struct qdisc_walker *w);
1225 
1226 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1227 {
1228 	struct check_loop_arg	arg;
1229 
1230 	if (q->ops->cl_ops == NULL)
1231 		return 0;
1232 
1233 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1234 	arg.w.fn = check_loop_fn;
1235 	arg.depth = depth;
1236 	arg.p = p;
1237 	q->ops->cl_ops->walk(q, &arg.w);
1238 	return arg.w.stop ? -ELOOP : 0;
1239 }
1240 
1241 static int
1242 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1243 {
1244 	struct Qdisc *leaf;
1245 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1246 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1247 
1248 	leaf = cops->leaf(q, cl);
1249 	if (leaf) {
1250 		if (leaf == arg->p || arg->depth > 7)
1251 			return -ELOOP;
1252 		return check_loop(leaf, arg->p, arg->depth + 1);
1253 	}
1254 	return 0;
1255 }
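
/*
 * Example: grafting qdisc 1: underneath one of its own descendants
 * would create a cycle. check_loop(q, p, 0) walks q's class tree (at
 * most 8 levels deep) and makes the caller fail with -ELOOP as soon
 * as it reaches p again.
 */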
1256 
1257 /*
1258  * Delete/get qdisc.
1259  */
1260 
1261 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1262 			struct netlink_ext_ack *extack)
1263 {
1264 	struct net *net = sock_net(skb->sk);
1265 	struct tcmsg *tcm = nlmsg_data(n);
1266 	struct nlattr *tca[TCA_MAX + 1];
1267 	struct net_device *dev;
1268 	u32 clid;
1269 	struct Qdisc *q = NULL;
1270 	struct Qdisc *p = NULL;
1271 	int err;
1272 
1273 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1274 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1275 		return -EPERM;
1276 
1277 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1278 	if (err < 0)
1279 		return err;
1280 
1281 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1282 	if (!dev)
1283 		return -ENODEV;
1284 
1285 	clid = tcm->tcm_parent;
1286 	if (clid) {
1287 		if (clid != TC_H_ROOT) {
1288 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1289 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1290 				if (!p) {
1291 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1292 					return -ENOENT;
1293 				}
1294 				q = qdisc_leaf(p, clid);
1295 			} else if (dev_ingress_queue(dev)) {
1296 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1297 			}
1298 		} else {
1299 			q = dev->qdisc;
1300 		}
1301 		if (!q) {
1302 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1303 			return -ENOENT;
1304 		}
1305 
1306 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1307 			NL_SET_ERR_MSG(extack, "Invalid handle");
1308 			return -EINVAL;
1309 		}
1310 	} else {
1311 		q = qdisc_lookup(dev, tcm->tcm_handle);
1312 		if (!q) {
1313 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1314 			return -ENOENT;
1315 		}
1316 	}
1317 
1318 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1319 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1320 		return -EINVAL;
1321 	}
1322 
1323 	if (n->nlmsg_type == RTM_DELQDISC) {
1324 		if (!clid) {
1325 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1326 			return -EINVAL;
1327 		}
1328 		if (q->handle == 0) {
1329 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1330 			return -ENOENT;
1331 		}
1332 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1333 		if (err != 0)
1334 			return err;
1335 	} else {
1336 		qdisc_notify(net, skb, n, clid, NULL, q);
1337 	}
1338 	return 0;
1339 }
1340 
1341 /*
1342  * Create/change qdisc.
1343  */
1344 
1345 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1346 			   struct netlink_ext_ack *extack)
1347 {
1348 	struct net *net = sock_net(skb->sk);
1349 	struct tcmsg *tcm;
1350 	struct nlattr *tca[TCA_MAX + 1];
1351 	struct net_device *dev;
1352 	u32 clid;
1353 	struct Qdisc *q, *p;
1354 	int err;
1355 
1356 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1357 		return -EPERM;
1358 
1359 replay:
1360 	/* Reinit, just in case something touches this. */
1361 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1362 	if (err < 0)
1363 		return err;
1364 
1365 	tcm = nlmsg_data(n);
1366 	clid = tcm->tcm_parent;
1367 	q = p = NULL;
1368 
1369 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1370 	if (!dev)
1371 		return -ENODEV;
1372 
1373 
1374 	if (clid) {
1375 		if (clid != TC_H_ROOT) {
1376 			if (clid != TC_H_INGRESS) {
1377 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1378 				if (!p) {
1379 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1380 					return -ENOENT;
1381 				}
1382 				q = qdisc_leaf(p, clid);
1383 			} else if (dev_ingress_queue_create(dev)) {
1384 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1385 			}
1386 		} else {
1387 			q = dev->qdisc;
1388 		}
1389 
1390 		/* It may be the default qdisc; ignore it */
1391 		if (q && q->handle == 0)
1392 			q = NULL;
1393 
1394 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1395 			if (tcm->tcm_handle) {
1396 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1397 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1398 					return -EEXIST;
1399 				}
1400 				if (TC_H_MIN(tcm->tcm_handle)) {
1401 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1402 					return -EINVAL;
1403 				}
1404 				q = qdisc_lookup(dev, tcm->tcm_handle);
1405 				if (!q)
1406 					goto create_n_graft;
1407 				if (n->nlmsg_flags & NLM_F_EXCL) {
1408 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1409 					return -EEXIST;
1410 				}
1411 				if (tca[TCA_KIND] &&
1412 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1413 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1414 					return -EINVAL;
1415 				}
1416 				if (q == p ||
1417 				    (p && check_loop(q, p, 0))) {
1418 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1419 					return -ELOOP;
1420 				}
1421 				qdisc_refcount_inc(q);
1422 				goto graft;
1423 			} else {
1424 				if (!q)
1425 					goto create_n_graft;
1426 
1427 				/* This magic test requires explanation.
1428 				 *
1429 				 *   We know that some child q is already
1430 				 *   attached to this parent and we have a choice:
1431 				 *   either to change it or to create/graft a new one.
1432 				 *
1433 				 *   1. We are allowed to create/graft only
1434 				 *   if the CREATE and REPLACE flags are set.
1435 				 *
1436 				 *   2. If EXCL is set, the requestor wanted to say
1437 				 *   that the qdisc tcm_handle is not expected
1438 				 *   to exist, so we choose create/graft too.
1439 				 *
1440 				 *   3. The last case is when no flags are set.
1441 				 *   Alas, it is sort of a hole in the API; we
1442 				 *   cannot decide what to do unambiguously.
1443 				 *   For now we select create/graft if the
1444 				 *   user gave a KIND which does not match the existing one.
1445 				 */
1446 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1447 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1448 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1449 				     (tca[TCA_KIND] &&
1450 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1451 					goto create_n_graft;
1452 			}
1453 		}
1454 	} else {
1455 		if (!tcm->tcm_handle) {
1456 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1457 			return -EINVAL;
1458 		}
1459 		q = qdisc_lookup(dev, tcm->tcm_handle);
1460 	}
1461 
1462 	/* Change qdisc parameters */
1463 	if (!q) {
1464 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1465 		return -ENOENT;
1466 	}
1467 	if (n->nlmsg_flags & NLM_F_EXCL) {
1468 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1469 		return -EEXIST;
1470 	}
1471 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1472 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1473 		return -EINVAL;
1474 	}
1475 	err = qdisc_change(q, tca, extack);
1476 	if (err == 0)
1477 		qdisc_notify(net, skb, n, clid, NULL, q);
1478 	return err;
1479 
1480 create_n_graft:
1481 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1482 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1483 		return -ENOENT;
1484 	}
1485 	if (clid == TC_H_INGRESS) {
1486 		if (dev_ingress_queue(dev)) {
1487 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1488 					 tcm->tcm_parent, tcm->tcm_parent,
1489 					 tca, &err, extack);
1490 		} else {
1491 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1492 			err = -ENOENT;
1493 		}
1494 	} else {
1495 		struct netdev_queue *dev_queue;
1496 
1497 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1498 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1499 		else if (p)
1500 			dev_queue = p->dev_queue;
1501 		else
1502 			dev_queue = netdev_get_tx_queue(dev, 0);
1503 
1504 		q = qdisc_create(dev, dev_queue, p,
1505 				 tcm->tcm_parent, tcm->tcm_handle,
1506 				 tca, &err, extack);
1507 	}
1508 	if (q == NULL) {
1509 		if (err == -EAGAIN)
1510 			goto replay;
1511 		return err;
1512 	}
1513 
1514 graft:
1515 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1516 	if (err) {
1517 		if (q)
1518 			qdisc_destroy(q);
1519 		return err;
1520 	}
1521 
1522 	return 0;
1523 }
1524 
1525 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1526 			      struct netlink_callback *cb,
1527 			      int *q_idx_p, int s_q_idx, bool recur,
1528 			      bool dump_invisible)
1529 {
1530 	int ret = 0, q_idx = *q_idx_p;
1531 	struct Qdisc *q;
1532 	int b;
1533 
1534 	if (!root)
1535 		return 0;
1536 
1537 	q = root;
1538 	if (q_idx < s_q_idx) {
1539 		q_idx++;
1540 	} else {
1541 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1542 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1543 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1544 				  RTM_NEWQDISC) <= 0)
1545 			goto done;
1546 		q_idx++;
1547 	}
1548 
1549 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1550 	 * itself has already been dumped.
1551 	 *
1552 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1553 	 * qdisc hashtable, we don't want to hit it again.
1554 	 */
1555 	if (!qdisc_dev(root) || !recur)
1556 		goto out;
1557 
1558 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1559 		if (q_idx < s_q_idx) {
1560 			q_idx++;
1561 			continue;
1562 		}
1563 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1564 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1565 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1566 				  RTM_NEWQDISC) <= 0)
1567 			goto done;
1568 		q_idx++;
1569 	}
1570 
1571 out:
1572 	*q_idx_p = q_idx;
1573 	return ret;
1574 done:
1575 	ret = -1;
1576 	goto out;
1577 }
1578 
1579 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1580 {
1581 	struct net *net = sock_net(skb->sk);
1582 	int idx, q_idx;
1583 	int s_idx, s_q_idx;
1584 	struct net_device *dev;
1585 	const struct nlmsghdr *nlh = cb->nlh;
1586 	struct nlattr *tca[TCA_MAX + 1];
1587 	int err;
1588 
1589 	s_idx = cb->args[0];
1590 	s_q_idx = q_idx = cb->args[1];
1591 
1592 	idx = 0;
1593 	ASSERT_RTNL();
1594 
1595 	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1596 	if (err < 0)
1597 		return err;
1598 
1599 	for_each_netdev(net, dev) {
1600 		struct netdev_queue *dev_queue;
1601 
1602 		if (idx < s_idx)
1603 			goto cont;
1604 		if (idx > s_idx)
1605 			s_q_idx = 0;
1606 		q_idx = 0;
1607 
1608 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1609 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1610 			goto done;
1611 
1612 		dev_queue = dev_ingress_queue(dev);
1613 		if (dev_queue &&
1614 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1615 				       &q_idx, s_q_idx, false,
1616 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1617 			goto done;
1618 
1619 cont:
1620 		idx++;
1621 	}
1622 
1623 done:
1624 	cb->args[0] = idx;
1625 	cb->args[1] = q_idx;
1626 
1627 	return skb->len;
1628 }
1629 
1630 
1631 
1632 /************************************************
1633  *	Traffic classes manipulation.		*
1634  ************************************************/
1635 
1636 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1637 			  unsigned long cl,
1638 			  u32 portid, u32 seq, u16 flags, int event)
1639 {
1640 	struct tcmsg *tcm;
1641 	struct nlmsghdr  *nlh;
1642 	unsigned char *b = skb_tail_pointer(skb);
1643 	struct gnet_dump d;
1644 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1645 
1646 	cond_resched();
1647 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1648 	if (!nlh)
1649 		goto out_nlmsg_trim;
1650 	tcm = nlmsg_data(nlh);
1651 	tcm->tcm_family = AF_UNSPEC;
1652 	tcm->tcm__pad1 = 0;
1653 	tcm->tcm__pad2 = 0;
1654 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1655 	tcm->tcm_parent = q->handle;
1656 	tcm->tcm_handle = q->handle;
1657 	tcm->tcm_info = 0;
1658 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1659 		goto nla_put_failure;
1660 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1661 		goto nla_put_failure;
1662 
1663 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1664 					 NULL, &d, TCA_PAD) < 0)
1665 		goto nla_put_failure;
1666 
1667 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1668 		goto nla_put_failure;
1669 
1670 	if (gnet_stats_finish_copy(&d) < 0)
1671 		goto nla_put_failure;
1672 
1673 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1674 	return skb->len;
1675 
1676 out_nlmsg_trim:
1677 nla_put_failure:
1678 	nlmsg_trim(skb, b);
1679 	return -1;
1680 }
1681 
1682 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1683 			 struct nlmsghdr *n, struct Qdisc *q,
1684 			 unsigned long cl, int event)
1685 {
1686 	struct sk_buff *skb;
1687 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1688 
1689 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1690 	if (!skb)
1691 		return -ENOBUFS;
1692 
1693 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1694 		kfree_skb(skb);
1695 		return -EINVAL;
1696 	}
1697 
1698 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1699 			      n->nlmsg_flags & NLM_F_ECHO);
1700 }
1701 
1702 static int tclass_del_notify(struct net *net,
1703 			     const struct Qdisc_class_ops *cops,
1704 			     struct sk_buff *oskb, struct nlmsghdr *n,
1705 			     struct Qdisc *q, unsigned long cl)
1706 {
1707 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1708 	struct sk_buff *skb;
1709 	int err = 0;
1710 
1711 	if (!cops->delete)
1712 		return -EOPNOTSUPP;
1713 
1714 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1715 	if (!skb)
1716 		return -ENOBUFS;
1717 
1718 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1719 			   RTM_DELTCLASS) < 0) {
1720 		kfree_skb(skb);
1721 		return -EINVAL;
1722 	}
1723 
1724 	err = cops->delete(q, cl);
1725 	if (err) {
1726 		kfree_skb(skb);
1727 		return err;
1728 	}
1729 
1730 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1731 			      n->nlmsg_flags & NLM_F_ECHO);
1732 }
1733 
1734 #ifdef CONFIG_NET_CLS
1735 
1736 struct tcf_bind_args {
1737 	struct tcf_walker w;
1738 	u32 classid;
1739 	unsigned long cl;
1740 };
1741 
1742 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1743 {
1744 	struct tcf_bind_args *a = (void *)arg;
1745 
1746 	if (tp->ops->bind_class) {
1747 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1748 
1749 		sch_tree_lock(q);
1750 		tp->ops->bind_class(n, a->classid, a->cl);
1751 		sch_tree_unlock(q);
1752 	}
1753 	return 0;
1754 }
1755 
1756 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1757 			   unsigned long new_cl)
1758 {
1759 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1760 	struct tcf_block *block;
1761 	struct tcf_chain *chain;
1762 	unsigned long cl;
1763 
1764 	cl = cops->find(q, portid);
1765 	if (!cl)
1766 		return;
1767 	block = cops->tcf_block(q, cl, NULL);
1768 	if (!block)
1769 		return;
1770 	list_for_each_entry(chain, &block->chain_list, list) {
1771 		struct tcf_proto *tp;
1772 
1773 		for (tp = rtnl_dereference(chain->filter_chain);
1774 		     tp; tp = rtnl_dereference(tp->next)) {
1775 			struct tcf_bind_args arg = {};
1776 
1777 			arg.w.fn = tcf_node_bind;
1778 			arg.classid = clid;
1779 			arg.cl = new_cl;
1780 			tp->ops->walk(tp, &arg.w);
1781 		}
1782 	}
1783 }
1784 
1785 #else
1786 
1787 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1788 			   unsigned long new_cl)
1789 {
1790 }
1791 
1792 #endif
1793 
1794 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1795 			 struct netlink_ext_ack *extack)
1796 {
1797 	struct net *net = sock_net(skb->sk);
1798 	struct tcmsg *tcm = nlmsg_data(n);
1799 	struct nlattr *tca[TCA_MAX + 1];
1800 	struct net_device *dev;
1801 	struct Qdisc *q = NULL;
1802 	const struct Qdisc_class_ops *cops;
1803 	unsigned long cl = 0;
1804 	unsigned long new_cl;
1805 	u32 portid;
1806 	u32 clid;
1807 	u32 qid;
1808 	int err;
1809 
1810 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1811 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1812 		return -EPERM;
1813 
1814 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1815 	if (err < 0)
1816 		return err;
1817 
1818 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1819 	if (!dev)
1820 		return -ENODEV;
1821 
1822 	/*
1823 	   parent == TC_H_UNSPEC - unspecified parent.
1824 	   parent == TC_H_ROOT   - class is root, which has no parent.
1825 	   parent == X:0	 - parent is root class.
1826 	   parent == X:Y	 - parent is a node in hierarchy.
1827 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1828 
1829 	   handle == 0:0	 - generate handle from kernel pool.
1830 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1831 	   handle == X:Y	 - clear.
1832 	   handle == X:0	 - root class.
1833 	 */
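
	/*
	 * Example: "tc class add dev eth0 parent 1:1 classid 1:10 ..."
	 * arrives with tcm_parent = 0x00010001 and tcm_handle =
	 * 0x00010010. Both majors agree, so qid stays 0x00010000 and
	 * qdisc 1: is looked up below.
	 */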
1834 
1835 	/* Step 1. Determine qdisc handle X:0 */
1836 
1837 	portid = tcm->tcm_parent;
1838 	clid = tcm->tcm_handle;
1839 	qid = TC_H_MAJ(clid);
1840 
1841 	if (portid != TC_H_ROOT) {
1842 		u32 qid1 = TC_H_MAJ(portid);
1843 
1844 		if (qid && qid1) {
1845 			/* If both majors are known, they must be identical. */
1846 			if (qid != qid1)
1847 				return -EINVAL;
1848 		} else if (qid1) {
1849 			qid = qid1;
1850 		} else if (qid == 0)
1851 			qid = dev->qdisc->handle;
1852 
1853 		/* Now qid is a genuine qdisc handle consistent
1854 		 * with both the parent and the child.
1855 		 *
1856 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1857 		 */
1858 		if (portid)
1859 			portid = TC_H_MAKE(qid, portid);
1860 	} else {
1861 		if (qid == 0)
1862 			qid = dev->qdisc->handle;
1863 	}
1864 
1865 	/* OK. Locate qdisc */
1866 	q = qdisc_lookup(dev, qid);
1867 	if (!q)
1868 		return -ENOENT;
1869 
1870 	/* And check that it supports classes */
1871 	cops = q->ops->cl_ops;
1872 	if (cops == NULL)
1873 		return -EINVAL;
1874 
1875 	/* Now try to get class */
1876 	if (clid == 0) {
1877 		if (portid == TC_H_ROOT)
1878 			clid = qid;
1879 	} else
1880 		clid = TC_H_MAKE(qid, clid);
1881 
1882 	if (clid)
1883 		cl = cops->find(q, clid);
1884 
1885 	if (cl == 0) {
1886 		err = -ENOENT;
1887 		if (n->nlmsg_type != RTM_NEWTCLASS ||
1888 		    !(n->nlmsg_flags & NLM_F_CREATE))
1889 			goto out;
1890 	} else {
1891 		switch (n->nlmsg_type) {
1892 		case RTM_NEWTCLASS:
1893 			err = -EEXIST;
1894 			if (n->nlmsg_flags & NLM_F_EXCL)
1895 				goto out;
1896 			break;
1897 		case RTM_DELTCLASS:
1898 			err = tclass_del_notify(net, cops, skb, n, q, cl);
1899 			/* Unbind the class's filters by rebinding them to classid 0 */
1900 			tc_bind_tclass(q, portid, clid, 0);
1901 			goto out;
1902 		case RTM_GETTCLASS:
1903 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1904 			goto out;
1905 		default:
1906 			err = -EINVAL;
1907 			goto out;
1908 		}
1909 	}
1910 
1911 	new_cl = cl;
1912 	err = -EOPNOTSUPP;
1913 	if (cops->change)
1914 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
1915 	if (err == 0) {
1916 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1917 		/* We just created a new class; we need to do the reverse binding. */
1918 		if (cl != new_cl)
1919 			tc_bind_tclass(q, portid, clid, new_cl);
1920 	}
1921 out:
1922 	return err;
1923 }
1924 
1925 struct qdisc_dump_args {
1926 	struct qdisc_walker	w;
1927 	struct sk_buff		*skb;
1928 	struct netlink_callback	*cb;
1929 };
1930 
1931 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1932 			    struct qdisc_walker *arg)
1933 {
1934 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1935 
1936 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1937 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1938 			      RTM_NEWTCLASS);
1939 }
1940 
1941 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1942 				struct tcmsg *tcm, struct netlink_callback *cb,
1943 				int *t_p, int s_t)
1944 {
1945 	struct qdisc_dump_args arg;
1946 
1947 	if (tc_qdisc_dump_ignore(q, false) ||
1948 	    *t_p < s_t || !q->ops->cl_ops ||
1949 	    (tcm->tcm_parent &&
1950 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1951 		(*t_p)++;
1952 		return 0;
1953 	}
1954 	if (*t_p > s_t)
1955 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1956 	arg.w.fn = qdisc_class_dump;
1957 	arg.skb = skb;
1958 	arg.cb = cb;
1959 	arg.w.stop  = 0;
1960 	arg.w.skip = cb->args[1];
1961 	arg.w.count = 0;
1962 	q->ops->cl_ops->walk(q, &arg.w);
1963 	cb->args[1] = arg.w.count;
1964 	if (arg.w.stop)
1965 		return -1;
1966 	(*t_p)++;
1967 	return 0;
1968 }
1969 
1970 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1971 			       struct tcmsg *tcm, struct netlink_callback *cb,
1972 			       int *t_p, int s_t)
1973 {
1974 	struct Qdisc *q;
1975 	int b;
1976 
1977 	if (!root)
1978 		return 0;
1979 
1980 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1981 		return -1;
1982 
1983 	if (!qdisc_dev(root))
1984 		return 0;
1985 
1986 	if (tcm->tcm_parent) {
1987 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
1988 		if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1989 			return -1;
1990 		return 0;
1991 	}
1992 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1993 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1994 			return -1;
1995 	}
1996 
1997 	return 0;
1998 }
1999 
2000 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2001 {
2002 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2003 	struct net *net = sock_net(skb->sk);
2004 	struct netdev_queue *dev_queue;
2005 	struct net_device *dev;
2006 	int t, s_t;
2007 
2008 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2009 		return 0;
2010 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2011 	if (!dev)
2012 		return 0;
2013 
2014 	s_t = cb->args[0];
2015 	t = 0;
2016 
2017 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2018 		goto done;
2019 
2020 	dev_queue = dev_ingress_queue(dev);
2021 	if (dev_queue &&
2022 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2023 				&t, s_t) < 0)
2024 		goto done;
2025 
2026 done:
2027 	cb->args[0] = t;
2028 
2029 	dev_put(dev);
2030 	return skb->len;
2031 }
2032 
2033 #ifdef CONFIG_PROC_FS
2034 static int psched_show(struct seq_file *seq, void *v)
2035 {
2036 	seq_printf(seq, "%08x %08x %08x %08x\n",
2037 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2038 		   1000000,
2039 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2040 
2041 	return 0;
2042 }
2043 
2044 static int psched_open(struct inode *inode, struct file *file)
2045 {
2046 	return single_open(file, psched_show, NULL);
2047 }
2048 
2049 static const struct file_operations psched_fops = {
2050 	.owner = THIS_MODULE,
2051 	.open = psched_open,
2052 	.read  = seq_read,
2053 	.llseek = seq_lseek,
2054 	.release = single_release,
2055 };
2056 
2057 static int __net_init psched_net_init(struct net *net)
2058 {
2059 	struct proc_dir_entry *e;
2060 
2061 	e = proc_create("psched", 0, net->proc_net, &psched_fops);
2062 	if (e == NULL)
2063 		return -ENOMEM;
2064 
2065 	return 0;
2066 }
2067 
2068 static void __net_exit psched_net_exit(struct net *net)
2069 {
2070 	remove_proc_entry("psched", net->proc_net);
2071 }
2072 #else
2073 static int __net_init psched_net_init(struct net *net)
2074 {
2075 	return 0;
2076 }
2077 
2078 static void __net_exit psched_net_exit(struct net *net)
2079 {
2080 }
2081 #endif
2082 
2083 static struct pernet_operations psched_net_ops = {
2084 	.init = psched_net_init,
2085 	.exit = psched_net_exit,
2086 };
2087 
2088 static int __init pktsched_init(void)
2089 {
2090 	int err;
2091 
2092 	err = register_pernet_subsys(&psched_net_ops);
2093 	if (err) {
2094 		pr_err("pktsched_init: "
2095 		       "cannot initialize per netns operations\n");
2096 		return err;
2097 	}
2098 
2099 	register_qdisc(&pfifo_fast_ops);
2100 	register_qdisc(&pfifo_qdisc_ops);
2101 	register_qdisc(&bfifo_qdisc_ops);
2102 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2103 	register_qdisc(&mq_qdisc_ops);
2104 	register_qdisc(&noqueue_qdisc_ops);
2105 
2106 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2107 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2108 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2109 		      0);
2110 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2111 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2112 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2113 		      0);
2114 
2115 	return 0;
2116 }
2117 
2118 subsys_initcall(pktsched_init);
2119