1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c	Packet scheduler API.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13 
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28 
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 
35 /*
36 
37    Short review.
38    -------------
39 
40    This file consists of two interrelated parts:
41 
42    1. queueing disciplines manager frontend.
43    2. traffic classes manager frontend.
44 
45    Generally, a queueing discipline ("qdisc") is a black box
46    that is able to enqueue packets and to dequeue them (when
47    the device is ready to send something) in an order and at
48    times determined by the algorithm hidden inside it.
49 
50    qdiscs are divided into two categories:
51    - "queues", which have no internal structure visible from outside.
52    - "schedulers", which split all the packets into "traffic classes",
53      using "packet classifiers" (see cls_api.c).
54 
55    In turn, classes may have child qdiscs (as a rule, queues)
56    attached to them, and so on.
57 
58    The goal of the routines in this file is to translate the
59    information supplied by the user in the form of handles into
60    a form more intelligible to the kernel, to perform the sanity
61    checks and bookkeeping common to all qdiscs, and to provide
62    rtnetlink notifications.
63 
64    All the real work is done inside the qdisc modules.
65 
66 
67 
68    Every discipline has two major routines: enqueue and dequeue.
69 
70    ---dequeue
71 
72    dequeue usually returns an skb to send. It is allowed to
73    return NULL, but that does not mean the queue is empty; it
74    only means the discipline does not want to send anything
75    right now.  The queue is really empty if q->q.qlen == 0.
76    For complicated disciplines with multiple queues, q->q is not
77    the real packet queue, but q->q.qlen must still be valid.
78 
79    ---enqueue
80 
81    enqueue returns 0 if the packet was enqueued successfully.
82    If a packet (this one or another one) was dropped, it returns
83    a non-zero error code:
84    NET_XMIT_DROP 	- this packet was dropped.
85      Expected action: do not back off, but wait until the queue clears.
86    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
87      Expected action: back off or ignore.
88 
89    Auxiliary routines:
90 
91    ---peek
92 
93    like dequeue but without removing a packet from the queue
94 
95    ---reset
96 
97    returns the qdisc to its initial state: purges all buffers,
98    clears all timers, counters (except statistics), etc.
99 
100    ---init
101 
102    initializes newly created qdisc.
103 
104    ---destroy
105 
106    destroys resources allocated by init and during the lifetime of the qdisc.
107 
108    ---change
109 
110    changes qdisc parameters.
111  */
112 
113 /* Protects the list of registered TC modules. It is a pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
115 
116 
117 /************************************************
118  *	Queueing disciplines manipulation.	*
119  ************************************************/
120 
121 
122 /* The list of all installed queueing disciplines. */
123 
124 static struct Qdisc_ops *qdisc_base;
125 
126 /* Register/unregister queueing discipline */
127 
128 int register_qdisc(struct Qdisc_ops *qops)
129 {
130 	struct Qdisc_ops *q, **qp;
131 	int rc = -EEXIST;
132 
133 	write_lock(&qdisc_mod_lock);
134 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135 		if (!strcmp(qops->id, q->id))
136 			goto out;
137 
138 	if (qops->enqueue == NULL)
139 		qops->enqueue = noop_qdisc_ops.enqueue;
140 	if (qops->peek == NULL) {
141 		if (qops->dequeue == NULL)
142 			qops->peek = noop_qdisc_ops.peek;
143 		else
144 			goto out_einval;
145 	}
146 	if (qops->dequeue == NULL)
147 		qops->dequeue = noop_qdisc_ops.dequeue;
148 
149 	if (qops->cl_ops) {
150 		const struct Qdisc_class_ops *cops = qops->cl_ops;
151 
152 		if (!(cops->find && cops->walk && cops->leaf))
153 			goto out_einval;
154 
155 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
156 			goto out_einval;
157 	}
158 
159 	qops->next = NULL;
160 	*qp = qops;
161 	rc = 0;
162 out:
163 	write_unlock(&qdisc_mod_lock);
164 	return rc;
165 
166 out_einval:
167 	rc = -EINVAL;
168 	goto out;
169 }
170 EXPORT_SYMBOL(register_qdisc);
171 
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174 	struct Qdisc_ops *q, **qp;
175 	int err = -ENOENT;
176 
177 	write_lock(&qdisc_mod_lock);
178 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179 		if (q == qops)
180 			break;
181 	if (q) {
182 		*qp = q->next;
183 		q->next = NULL;
184 		err = 0;
185 	}
186 	write_unlock(&qdisc_mod_lock);
187 	return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
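/* A sketch of how a scheduler module typically uses the pair above in
 * its init/exit hooks, assuming the hypothetical example_qdisc_ops and
 * the example_enqueue/example_dequeue helpers sketched earlier:
 */
#if 0
static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,	/* required once dequeue is set */
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif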
190 
191 /* Get default qdisc if not otherwise specified */
192 void qdisc_get_default(char *name, size_t len)
193 {
194 	read_lock(&qdisc_mod_lock);
195 	strlcpy(name, default_qdisc_ops->id, len);
196 	read_unlock(&qdisc_mod_lock);
197 }
198 
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201 	struct Qdisc_ops *q = NULL;
202 
203 	for (q = qdisc_base; q; q = q->next) {
204 		if (!strcmp(name, q->id)) {
205 			if (!try_module_get(q->owner))
206 				q = NULL;
207 			break;
208 		}
209 	}
210 
211 	return q;
212 }
213 
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
216 {
217 	const struct Qdisc_ops *ops;
218 
219 	if (!capable(CAP_NET_ADMIN))
220 		return -EPERM;
221 
222 	write_lock(&qdisc_mod_lock);
223 	ops = qdisc_lookup_default(name);
224 	if (!ops) {
225 		/* Not found, drop lock and try to load module */
226 		write_unlock(&qdisc_mod_lock);
227 		request_module("sch_%s", name);
228 		write_lock(&qdisc_mod_lock);
229 
230 		ops = qdisc_lookup_default(name);
231 	}
232 
233 	if (ops) {
234 		/* Set new default */
235 		module_put(default_qdisc_ops->owner);
236 		default_qdisc_ops = ops;
237 	}
238 	write_unlock(&qdisc_mod_lock);
239 
240 	return ops ? 0 : -ENOENT;
241 }
242 
243 #ifdef CONFIG_NET_SCH_DEFAULT
244 /* Set default value from kernel config */
245 static int __init sch_default_qdisc(void)
246 {
247 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
248 }
249 late_initcall(sch_default_qdisc);
250 #endif
251 
252 /* We know the handle. Find the qdisc among all qdiscs attached to the device
253  * (the root qdisc, all its children, children of children, etc.)
254  * Note: caller either uses rtnl or rcu_read_lock()
255  */
256 
257 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
258 {
259 	struct Qdisc *q;
260 
261 	if (!qdisc_dev(root))
262 		return (root->handle == handle ? root : NULL);
263 
264 	if (!(root->flags & TCQ_F_BUILTIN) &&
265 	    root->handle == handle)
266 		return root;
267 
268 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
269 		if (q->handle == handle)
270 			return q;
271 	}
272 	return NULL;
273 }
274 
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
276 {
277 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278 		ASSERT_RTNL();
279 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280 		if (invisible)
281 			q->flags |= TCQ_F_INVISIBLE;
282 	}
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285 
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289 		ASSERT_RTNL();
290 		hash_del_rcu(&q->hash);
291 	}
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294 
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297 	struct Qdisc *q;
298 
299 	if (!handle)
300 		return NULL;
301 	q = qdisc_match_from_root(dev->qdisc, handle);
302 	if (q)
303 		goto out;
304 
305 	if (dev_ingress_queue(dev))
306 		q = qdisc_match_from_root(
307 			dev_ingress_queue(dev)->qdisc_sleeping,
308 			handle);
309 out:
310 	return q;
311 }
312 
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314 {
315 	struct netdev_queue *nq;
316 	struct Qdisc *q;
317 
318 	if (!handle)
319 		return NULL;
320 	q = qdisc_match_from_root(dev->qdisc, handle);
321 	if (q)
322 		goto out;
323 
324 	nq = dev_ingress_queue_rcu(dev);
325 	if (nq)
326 		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327 out:
328 	return q;
329 }
330 
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333 	unsigned long cl;
334 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335 
336 	if (cops == NULL)
337 		return NULL;
338 	cl = cops->find(p, classid);
339 
340 	if (cl == 0)
341 		return NULL;
342 	return cops->leaf(p, cl);
343 }
344 
345 /* Find queueing discipline by name */
346 
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349 	struct Qdisc_ops *q = NULL;
350 
351 	if (kind) {
352 		read_lock(&qdisc_mod_lock);
353 		for (q = qdisc_base; q; q = q->next) {
354 			if (nla_strcmp(kind, q->id) == 0) {
355 				if (!try_module_get(q->owner))
356 					q = NULL;
357 				break;
358 			}
359 		}
360 		read_unlock(&qdisc_mod_lock);
361 	}
362 	return q;
363 }
364 
365 /* The linklayer setting was not transferred from iproute2 in
366  * older versions, and the rate table lookup system has been
367  * dropped from the kernel. To stay backward compatible with
368  * older iproute2 tc utilities, we detect the linklayer setting
369  * by checking whether the rate table was modified.
370  *
371  * For linklayer ATM table entries, the rate table is aligned to
372  * 48-byte cells, so some table entries will contain the same
373  * value.  The mpu (min packet unit) is also encoded into the old
374  * rate table, so starting from the mpu we find the low and high
375  * table entries mapping this cell.  If these entries contain the
376  * same value, then the rate table was modified for linklayer ATM.
377  *
378  * This is done by rounding the mpu up to the nearest 48-byte
379  * cell/entry, then rounding up to the next cell, calculating the
380  * table entry one below, and comparing the two.
381  */
382 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
383 {
384 	int low       = roundup(r->mpu, 48);
385 	int high      = roundup(low+1, 48);
386 	int cell_low  = low >> r->cell_log;
387 	int cell_high = (high >> r->cell_log) - 1;
388 
389 	/* rtab is too inaccurate at rates > 100Mbit/s */
390 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
391 		pr_debug("TC linklayer: Giving up ATM detection\n");
392 		return TC_LINKLAYER_ETHERNET;
393 	}
394 
395 	if ((cell_high > cell_low) && (cell_high < 256)
396 	    && (rtab[cell_low] == rtab[cell_high])) {
397 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
398 			 cell_low, cell_high, rtab[cell_high]);
399 		return TC_LINKLAYER_ATM;
400 	}
401 	return TC_LINKLAYER_ETHERNET;
402 }
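/* A worked example of the detection above, assuming mpu = 0 and
 * cell_log = 3 (one rate table entry per 8 bytes):
 *
 *   low  = roundup(0, 48) = 0   ->  cell_low  = 0 >> 3        = 0
 *   high = roundup(1, 48) = 48  ->  cell_high = (48 >> 3) - 1 = 5
 *
 * On an ATM-aligned table, all packet sizes 0..47 occupy a single
 * 48-byte cell, so rtab[0] == rtab[5] and TC_LINKLAYER_ATM is
 * reported; on an Ethernet table the cost grows with every byte,
 * so the two entries differ.
 */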
403 
404 static struct qdisc_rate_table *qdisc_rtab_list;
405 
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407 					struct nlattr *tab,
408 					struct netlink_ext_ack *extack)
409 {
410 	struct qdisc_rate_table *rtab;
411 
412 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
413 	    nla_len(tab) != TC_RTAB_SIZE) {
414 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
415 		return NULL;
416 	}
417 
418 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
419 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
420 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
421 			rtab->refcnt++;
422 			return rtab;
423 		}
424 	}
425 
426 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
427 	if (rtab) {
428 		rtab->rate = *r;
429 		rtab->refcnt = 1;
430 		memcpy(rtab->data, nla_data(tab), 1024);
431 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
432 			r->linklayer = __detect_linklayer(r, rtab->data);
433 		rtab->next = qdisc_rtab_list;
434 		qdisc_rtab_list = rtab;
435 	} else {
436 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
437 	}
438 	return rtab;
439 }
440 EXPORT_SYMBOL(qdisc_get_rtab);
441 
442 void qdisc_put_rtab(struct qdisc_rate_table *tab)
443 {
444 	struct qdisc_rate_table *rtab, **rtabp;
445 
446 	if (!tab || --tab->refcnt)
447 		return;
448 
449 	for (rtabp = &qdisc_rtab_list;
450 	     (rtab = *rtabp) != NULL;
451 	     rtabp = &rtab->next) {
452 		if (rtab == tab) {
453 			*rtabp = rtab->next;
454 			kfree(rtab);
455 			return;
456 		}
457 	}
458 }
459 EXPORT_SYMBOL(qdisc_put_rtab);
460 
461 static LIST_HEAD(qdisc_stab_list);
462 
463 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
464 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
465 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
466 };
467 
468 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
469 					       struct netlink_ext_ack *extack)
470 {
471 	struct nlattr *tb[TCA_STAB_MAX + 1];
472 	struct qdisc_size_table *stab;
473 	struct tc_sizespec *s;
474 	unsigned int tsize = 0;
475 	u16 *tab = NULL;
476 	int err;
477 
478 	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
479 					  extack);
480 	if (err < 0)
481 		return ERR_PTR(err);
482 	if (!tb[TCA_STAB_BASE]) {
483 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
484 		return ERR_PTR(-EINVAL);
485 	}
486 
487 	s = nla_data(tb[TCA_STAB_BASE]);
488 
489 	if (s->tsize > 0) {
490 		if (!tb[TCA_STAB_DATA]) {
491 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
492 			return ERR_PTR(-EINVAL);
493 		}
494 		tab = nla_data(tb[TCA_STAB_DATA]);
495 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
496 	}
497 
498 	if (tsize != s->tsize || (!tab && tsize > 0)) {
499 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
500 		return ERR_PTR(-EINVAL);
501 	}
502 
503 	list_for_each_entry(stab, &qdisc_stab_list, list) {
504 		if (memcmp(&stab->szopts, s, sizeof(*s)))
505 			continue;
506 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
507 			continue;
508 		stab->refcnt++;
509 		return stab;
510 	}
511 
512 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
513 	if (!stab)
514 		return ERR_PTR(-ENOMEM);
515 
516 	stab->refcnt = 1;
517 	stab->szopts = *s;
518 	if (tsize > 0)
519 		memcpy(stab->data, tab, tsize * sizeof(u16));
520 
521 	list_add_tail(&stab->list, &qdisc_stab_list);
522 
523 	return stab;
524 }
525 
526 void qdisc_put_stab(struct qdisc_size_table *tab)
527 {
528 	if (!tab)
529 		return;
530 
531 	if (--tab->refcnt == 0) {
532 		list_del(&tab->list);
533 		kfree_rcu(tab, rcu);
534 	}
535 }
536 EXPORT_SYMBOL(qdisc_put_stab);
537 
538 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
539 {
540 	struct nlattr *nest;
541 
542 	nest = nla_nest_start_noflag(skb, TCA_STAB);
543 	if (nest == NULL)
544 		goto nla_put_failure;
545 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
546 		goto nla_put_failure;
547 	nla_nest_end(skb, nest);
548 
549 	return skb->len;
550 
551 nla_put_failure:
552 	return -1;
553 }
554 
555 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
556 			       const struct qdisc_size_table *stab)
557 {
558 	int pkt_len, slot;
559 
560 	pkt_len = skb->len + stab->szopts.overhead;
561 	if (unlikely(!stab->szopts.tsize))
562 		goto out;
563 
564 	slot = pkt_len + stab->szopts.cell_align;
565 	if (unlikely(slot < 0))
566 		slot = 0;
567 
568 	slot >>= stab->szopts.cell_log;
569 	if (likely(slot < stab->szopts.tsize))
570 		pkt_len = stab->data[slot];
571 	else
572 		pkt_len = stab->data[stab->szopts.tsize - 1] *
573 				(slot / stab->szopts.tsize) +
574 				stab->data[slot % stab->szopts.tsize];
575 
576 	pkt_len <<= stab->szopts.size_log;
577 out:
578 	if (unlikely(pkt_len < 1))
579 		pkt_len = 1;
580 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
581 }
582 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
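/* A worked example of the lookup above, assuming a size table built
 * with overhead = 24, cell_align = 0, cell_log = 6, size_log = 6 and
 * tsize = 512 (values chosen only for illustration):
 *
 *   skb->len = 1000  ->  pkt_len = 1000 + 24 = 1024
 *   slot = 1024 >> 6 = 16  ->  pkt_len = stab->data[16] << 6
 *
 * i.e. the table maps the adjusted length to a number of 64-byte
 * cells; slots beyond tsize are extrapolated from the last entry.
 */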
583 
584 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
585 {
586 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
587 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
588 			txt, qdisc->ops->id, qdisc->handle >> 16);
589 		qdisc->flags |= TCQ_F_WARN_NONWC;
590 	}
591 }
592 EXPORT_SYMBOL(qdisc_warn_nonwc);
593 
594 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
595 {
596 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
597 						 timer);
598 
599 	rcu_read_lock();
600 	__netif_schedule(qdisc_root(wd->qdisc));
601 	rcu_read_unlock();
602 
603 	return HRTIMER_NORESTART;
604 }
605 
606 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
607 				 clockid_t clockid)
608 {
609 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
610 	wd->timer.function = qdisc_watchdog;
611 	wd->qdisc = qdisc;
612 }
613 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
614 
615 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
616 {
617 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
618 }
619 EXPORT_SYMBOL(qdisc_watchdog_init);
620 
621 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
622 				      u64 delta_ns)
623 {
624 	if (test_bit(__QDISC_STATE_DEACTIVATED,
625 		     &qdisc_root_sleeping(wd->qdisc)->state))
626 		return;
627 
628 	if (hrtimer_is_queued(&wd->timer)) {
629 		/* If timer is already set in [expires, expires + delta_ns],
630 		 * do not reprogram it.
631 		 */
632 		if (wd->last_expires - expires <= delta_ns)
633 			return;
634 	}
635 
636 	wd->last_expires = expires;
637 	hrtimer_start_range_ns(&wd->timer,
638 			       ns_to_ktime(expires),
639 			       delta_ns,
640 			       HRTIMER_MODE_ABS_PINNED);
641 }
642 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
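/* A sketch of the usual watchdog consumer: a rate-limiting qdisc whose
 * dequeue defers transmission until its next allowed send time,
 * following the pattern of sch_tbf.c (struct example_sched_data and
 * its fields are hypothetical):
 */
#if 0
struct example_sched_data {
	struct qdisc_watchdog	watchdog;	/* armed when we must wait */
	u64			next_tx_time;	/* ns, monotonic clock */
};

static struct sk_buff *example_rate_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_get_ns();

	if (now < q->next_tx_time) {
		/* Not enough credit yet: sleep until the deadline
		 * instead of busy-polling; dequeue returning NULL
		 * does not mean the queue is empty.
		 */
		qdisc_watchdog_schedule_range_ns(&q->watchdog,
						 q->next_tx_time, 0);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}
#endif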
643 
644 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
645 {
646 	hrtimer_cancel(&wd->timer);
647 }
648 EXPORT_SYMBOL(qdisc_watchdog_cancel);
649 
650 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
651 {
652 	struct hlist_head *h;
653 	unsigned int i;
654 
655 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
656 
657 	if (h != NULL) {
658 		for (i = 0; i < n; i++)
659 			INIT_HLIST_HEAD(&h[i]);
660 	}
661 	return h;
662 }
663 
664 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
665 {
666 	struct Qdisc_class_common *cl;
667 	struct hlist_node *next;
668 	struct hlist_head *nhash, *ohash;
669 	unsigned int nsize, nmask, osize;
670 	unsigned int i, h;
671 
672 	/* Rehash when load factor exceeds 0.75 */
673 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
674 		return;
675 	nsize = clhash->hashsize * 2;
676 	nmask = nsize - 1;
677 	nhash = qdisc_class_hash_alloc(nsize);
678 	if (nhash == NULL)
679 		return;
680 
681 	ohash = clhash->hash;
682 	osize = clhash->hashsize;
683 
684 	sch_tree_lock(sch);
685 	for (i = 0; i < osize; i++) {
686 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
687 			h = qdisc_class_hash(cl->classid, nmask);
688 			hlist_add_head(&cl->hnode, &nhash[h]);
689 		}
690 	}
691 	clhash->hash     = nhash;
692 	clhash->hashsize = nsize;
693 	clhash->hashmask = nmask;
694 	sch_tree_unlock(sch);
695 
696 	kvfree(ohash);
697 }
698 EXPORT_SYMBOL(qdisc_class_hash_grow);
699 
700 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
701 {
702 	unsigned int size = 4;
703 
704 	clhash->hash = qdisc_class_hash_alloc(size);
705 	if (!clhash->hash)
706 		return -ENOMEM;
707 	clhash->hashsize  = size;
708 	clhash->hashmask  = size - 1;
709 	clhash->hashelems = 0;
710 	return 0;
711 }
712 EXPORT_SYMBOL(qdisc_class_hash_init);
713 
714 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
715 {
716 	kvfree(clhash->hash);
717 }
718 EXPORT_SYMBOL(qdisc_class_hash_destroy);
719 
720 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
721 			     struct Qdisc_class_common *cl)
722 {
723 	unsigned int h;
724 
725 	INIT_HLIST_NODE(&cl->hnode);
726 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
727 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
728 	clhash->hashelems++;
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_insert);
731 
732 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
733 			     struct Qdisc_class_common *cl)
734 {
735 	hlist_del(&cl->hnode);
736 	clhash->hashelems--;
737 }
738 EXPORT_SYMBOL(qdisc_class_hash_remove);
739 
740 /* Allocate a unique handle from the space managed by the kernel.
741  * The possible range is [8000-FFFF]:0000 (0x8000 values).
742  */
743 static u32 qdisc_alloc_handle(struct net_device *dev)
744 {
745 	int i = 0x8000;
746 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
747 
748 	do {
749 		autohandle += TC_H_MAKE(0x10000U, 0);
750 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
751 			autohandle = TC_H_MAKE(0x80000000U, 0);
752 		if (!qdisc_lookup(dev, autohandle))
753 			return autohandle;
754 		cond_resched();
755 	} while	(--i > 0);
756 
757 	return 0;
758 }
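/* Handle layout refresher for the allocator above: a handle packs
 * major:minor into a u32, so TC_H_MAKE(0x80010000U, 0) is the qdisc
 * handle "8001:".  The loop walks at most the 0x8000 automatic majors
 * 8000: .. ffff:, skipping values already taken on this device.
 */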
759 
760 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
761 {
762 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
763 	const struct Qdisc_class_ops *cops;
764 	unsigned long cl;
765 	u32 parentid;
766 	bool notify;
767 	int drops;
768 
769 	if (n == 0 && len == 0)
770 		return;
771 	drops = max_t(int, n, 0);
772 	rcu_read_lock();
773 	while ((parentid = sch->parent)) {
774 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
775 			break;
776 
777 		if (sch->flags & TCQ_F_NOPARENT)
778 			break;
779 		/* Notify parent qdisc only if child qdisc becomes empty.
780 		 *
781 		 * If the child was empty even before the update, then the
782 		 * backlog counter is screwed and we skip the notification,
783 		 * because the parent class is already passive.
784 		 *
785 		 * If the original child was offloaded, then it is allowed
786 		 * to be seen as empty, so the parent is notified anyway.
787 		 */
788 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
789 						       !qdisc_is_offloaded);
790 		/* TODO: perform the search on a per txq basis */
791 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
792 		if (sch == NULL) {
793 			WARN_ON_ONCE(parentid != TC_H_ROOT);
794 			break;
795 		}
796 		cops = sch->ops->cl_ops;
797 		if (notify && cops->qlen_notify) {
798 			cl = cops->find(sch, parentid);
799 			cops->qlen_notify(sch, cl);
800 		}
801 		sch->q.qlen -= n;
802 		sch->qstats.backlog -= len;
803 		__qdisc_qstats_drop(sch, drops);
804 	}
805 	rcu_read_unlock();
806 }
807 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
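/* A sketch of the typical caller of qdisc_tree_reduce_backlog(): a
 * qdisc that drops packets internally (for example while shrinking its
 * limit) must report the decrease so every ancestor's qlen/backlog
 * stays consistent.  Modelled on the pattern used by existing
 * schedulers such as sch_fq_codel.c; the function below is hypothetical:
 */
#if 0
static void example_shrink_queue(struct Qdisc *sch, unsigned int new_limit)
{
	unsigned int dropped_pkts = 0, dropped_bytes = 0;

	while (sch->q.qlen > new_limit) {
		struct sk_buff *skb = qdisc_dequeue_head(sch);

		dropped_pkts++;
		dropped_bytes += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
	}
	/* Propagate the decrease to all ancestors. */
	qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
}
#endif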
808 
809 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
810 			      void *type_data)
811 {
812 	struct net_device *dev = qdisc_dev(sch);
813 	int err;
814 
815 	sch->flags &= ~TCQ_F_OFFLOADED;
816 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
817 		return 0;
818 
819 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
820 	if (err == -EOPNOTSUPP)
821 		return 0;
822 
823 	if (!err)
824 		sch->flags |= TCQ_F_OFFLOADED;
825 
826 	return err;
827 }
828 EXPORT_SYMBOL(qdisc_offload_dump_helper);
829 
830 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
831 				struct Qdisc *new, struct Qdisc *old,
832 				enum tc_setup_type type, void *type_data,
833 				struct netlink_ext_ack *extack)
834 {
835 	bool any_qdisc_is_offloaded;
836 	int err;
837 
838 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
839 		return;
840 
841 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
842 
843 	/* Don't report error if the graft is part of destroy operation. */
844 	if (!err || !new || new == &noop_qdisc)
845 		return;
846 
847 	/* Don't report error if the parent, the old child and the new
848 	 * one are not offloaded.
849 	 */
850 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
851 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
852 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
853 
854 	if (any_qdisc_is_offloaded)
855 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
856 }
857 EXPORT_SYMBOL(qdisc_offload_graft_helper);
858 
859 static void qdisc_offload_graft_root(struct net_device *dev,
860 				     struct Qdisc *new, struct Qdisc *old,
861 				     struct netlink_ext_ack *extack)
862 {
863 	struct tc_root_qopt_offload graft_offload = {
864 		.command	= TC_ROOT_GRAFT,
865 		.handle		= new ? new->handle : 0,
866 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
867 				  (old && old->flags & TCQ_F_INGRESS),
868 	};
869 
870 	qdisc_offload_graft_helper(dev, NULL, new, old,
871 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
872 }
873 
874 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
875 			 u32 portid, u32 seq, u16 flags, int event)
876 {
877 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
878 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
879 	struct tcmsg *tcm;
880 	struct nlmsghdr  *nlh;
881 	unsigned char *b = skb_tail_pointer(skb);
882 	struct gnet_dump d;
883 	struct qdisc_size_table *stab;
884 	u32 block_index;
885 	__u32 qlen;
886 
887 	cond_resched();
888 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
889 	if (!nlh)
890 		goto out_nlmsg_trim;
891 	tcm = nlmsg_data(nlh);
892 	tcm->tcm_family = AF_UNSPEC;
893 	tcm->tcm__pad1 = 0;
894 	tcm->tcm__pad2 = 0;
895 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
896 	tcm->tcm_parent = clid;
897 	tcm->tcm_handle = q->handle;
898 	tcm->tcm_info = refcount_read(&q->refcnt);
899 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
900 		goto nla_put_failure;
901 	if (q->ops->ingress_block_get) {
902 		block_index = q->ops->ingress_block_get(q);
903 		if (block_index &&
904 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
905 			goto nla_put_failure;
906 	}
907 	if (q->ops->egress_block_get) {
908 		block_index = q->ops->egress_block_get(q);
909 		if (block_index &&
910 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
911 			goto nla_put_failure;
912 	}
913 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
914 		goto nla_put_failure;
915 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
916 		goto nla_put_failure;
917 	qlen = qdisc_qlen_sum(q);
918 
919 	stab = rtnl_dereference(q->stab);
920 	if (stab && qdisc_dump_stab(skb, stab) < 0)
921 		goto nla_put_failure;
922 
923 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
924 					 NULL, &d, TCA_PAD) < 0)
925 		goto nla_put_failure;
926 
927 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
928 		goto nla_put_failure;
929 
930 	if (qdisc_is_percpu_stats(q)) {
931 		cpu_bstats = q->cpu_bstats;
932 		cpu_qstats = q->cpu_qstats;
933 	}
934 
935 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
936 				  &d, cpu_bstats, &q->bstats) < 0 ||
937 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
938 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
939 		goto nla_put_failure;
940 
941 	if (gnet_stats_finish_copy(&d) < 0)
942 		goto nla_put_failure;
943 
944 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
945 	return skb->len;
946 
947 out_nlmsg_trim:
948 nla_put_failure:
949 	nlmsg_trim(skb, b);
950 	return -1;
951 }
952 
953 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
954 {
955 	if (q->flags & TCQ_F_BUILTIN)
956 		return true;
957 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
958 		return true;
959 
960 	return false;
961 }
962 
963 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
964 			struct nlmsghdr *n, u32 clid,
965 			struct Qdisc *old, struct Qdisc *new)
966 {
967 	struct sk_buff *skb;
968 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
969 
970 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
971 	if (!skb)
972 		return -ENOBUFS;
973 
974 	if (old && !tc_qdisc_dump_ignore(old, false)) {
975 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
976 				  0, RTM_DELQDISC) < 0)
977 			goto err_out;
978 	}
979 	if (new && !tc_qdisc_dump_ignore(new, false)) {
980 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
981 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
982 			goto err_out;
983 	}
984 
985 	if (skb->len)
986 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
987 				      n->nlmsg_flags & NLM_F_ECHO);
988 
989 err_out:
990 	kfree_skb(skb);
991 	return -EINVAL;
992 }
993 
994 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
995 			       struct nlmsghdr *n, u32 clid,
996 			       struct Qdisc *old, struct Qdisc *new)
997 {
998 	if (new || old)
999 		qdisc_notify(net, skb, n, clid, old, new);
1000 
1001 	if (old)
1002 		qdisc_put(old);
1003 }
1004 
1005 static void qdisc_clear_nolock(struct Qdisc *sch)
1006 {
1007 	sch->flags &= ~TCQ_F_NOLOCK;
1008 	if (!(sch->flags & TCQ_F_CPUSTATS))
1009 		return;
1010 
1011 	free_percpu(sch->cpu_bstats);
1012 	free_percpu(sch->cpu_qstats);
1013 	sch->cpu_bstats = NULL;
1014 	sch->cpu_qstats = NULL;
1015 	sch->flags &= ~TCQ_F_CPUSTATS;
1016 }
1017 
1018 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1019  * to device "dev".
1020  *
1021  * When appropriate, send a netlink notification using 'skb'
1022  * and "n".
1023  *
1024  * On success, destroy old qdisc.
1025  */
1026 
1027 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1028 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1029 		       struct Qdisc *new, struct Qdisc *old,
1030 		       struct netlink_ext_ack *extack)
1031 {
1032 	struct Qdisc *q = old;
1033 	struct net *net = dev_net(dev);
1034 
1035 	if (parent == NULL) {
1036 		unsigned int i, num_q, ingress;
1037 
1038 		ingress = 0;
1039 		num_q = dev->num_tx_queues;
1040 		if ((q && q->flags & TCQ_F_INGRESS) ||
1041 		    (new && new->flags & TCQ_F_INGRESS)) {
1042 			num_q = 1;
1043 			ingress = 1;
1044 			if (!dev_ingress_queue(dev)) {
1045 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1046 				return -ENOENT;
1047 			}
1048 		}
1049 
1050 		if (dev->flags & IFF_UP)
1051 			dev_deactivate(dev);
1052 
1053 		qdisc_offload_graft_root(dev, new, old, extack);
1054 
1055 		if (new && new->ops->attach)
1056 			goto skip;
1057 
1058 		for (i = 0; i < num_q; i++) {
1059 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1060 
1061 			if (!ingress)
1062 				dev_queue = netdev_get_tx_queue(dev, i);
1063 
1064 			old = dev_graft_qdisc(dev_queue, new);
1065 			if (new && i > 0)
1066 				qdisc_refcount_inc(new);
1067 
1068 			if (!ingress)
1069 				qdisc_put(old);
1070 		}
1071 
1072 skip:
1073 		if (!ingress) {
1074 			notify_and_destroy(net, skb, n, classid,
1075 					   dev->qdisc, new);
1076 			if (new && !new->ops->attach)
1077 				qdisc_refcount_inc(new);
1078 			dev->qdisc = new ? : &noop_qdisc;
1079 
1080 			if (new && new->ops->attach)
1081 				new->ops->attach(new);
1082 		} else {
1083 			notify_and_destroy(net, skb, n, classid, old, new);
1084 		}
1085 
1086 		if (dev->flags & IFF_UP)
1087 			dev_activate(dev);
1088 	} else {
1089 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1090 		unsigned long cl;
1091 		int err;
1092 
1093 		/* Only support running class lockless if parent is lockless */
1094 		if (new && (new->flags & TCQ_F_NOLOCK) &&
1095 		    parent && !(parent->flags & TCQ_F_NOLOCK))
1096 			qdisc_clear_nolock(new);
1097 
1098 		if (!cops || !cops->graft)
1099 			return -EOPNOTSUPP;
1100 
1101 		cl = cops->find(parent, classid);
1102 		if (!cl) {
1103 			NL_SET_ERR_MSG(extack, "Specified class not found");
1104 			return -ENOENT;
1105 		}
1106 
1107 		err = cops->graft(parent, cl, new, &old, extack);
1108 		if (err)
1109 			return err;
1110 		notify_and_destroy(net, skb, n, classid, old, new);
1111 	}
1112 	return 0;
1113 }
1114 
1115 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1116 				   struct netlink_ext_ack *extack)
1117 {
1118 	u32 block_index;
1119 
1120 	if (tca[TCA_INGRESS_BLOCK]) {
1121 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1122 
1123 		if (!block_index) {
1124 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1125 			return -EINVAL;
1126 		}
1127 		if (!sch->ops->ingress_block_set) {
1128 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1129 			return -EOPNOTSUPP;
1130 		}
1131 		sch->ops->ingress_block_set(sch, block_index);
1132 	}
1133 	if (tca[TCA_EGRESS_BLOCK]) {
1134 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1135 
1136 		if (!block_index) {
1137 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1138 			return -EINVAL;
1139 		}
1140 		if (!sch->ops->egress_block_set) {
1141 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1142 			return -EOPNOTSUPP;
1143 		}
1144 		sch->ops->egress_block_set(sch, block_index);
1145 	}
1146 	return 0;
1147 }
1148 
1149 /*
1150    Allocate and initialize a new qdisc.
1151 
1152    Parameters are passed via opt.
1153  */
1154 
1155 static struct Qdisc *qdisc_create(struct net_device *dev,
1156 				  struct netdev_queue *dev_queue,
1157 				  struct Qdisc *p, u32 parent, u32 handle,
1158 				  struct nlattr **tca, int *errp,
1159 				  struct netlink_ext_ack *extack)
1160 {
1161 	int err;
1162 	struct nlattr *kind = tca[TCA_KIND];
1163 	struct Qdisc *sch;
1164 	struct Qdisc_ops *ops;
1165 	struct qdisc_size_table *stab;
1166 
1167 	ops = qdisc_lookup_ops(kind);
1168 #ifdef CONFIG_MODULES
1169 	if (ops == NULL && kind != NULL) {
1170 		char name[IFNAMSIZ];
1171 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1172 			/* We dropped the RTNL semaphore in order to
1173 			 * perform the module load.  So, even if we
1174 			 * succeeded in loading the module we have to
1175 			 * tell the caller to replay the request.  We
1176 			 * indicate this using -EAGAIN.
1177 			 * We replay the request because the device may
1178 			 * go away in the mean time.
1179 			 */
1180 			rtnl_unlock();
1181 			request_module("sch_%s", name);
1182 			rtnl_lock();
1183 			ops = qdisc_lookup_ops(kind);
1184 			if (ops != NULL) {
1185 				/* We will try qdisc_lookup_ops again,
1186 				 * so don't keep a reference.
1187 				 */
1188 				module_put(ops->owner);
1189 				err = -EAGAIN;
1190 				goto err_out;
1191 			}
1192 		}
1193 	}
1194 #endif
1195 
1196 	err = -ENOENT;
1197 	if (!ops) {
1198 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1199 		goto err_out;
1200 	}
1201 
1202 	sch = qdisc_alloc(dev_queue, ops, extack);
1203 	if (IS_ERR(sch)) {
1204 		err = PTR_ERR(sch);
1205 		goto err_out2;
1206 	}
1207 
1208 	sch->parent = parent;
1209 
1210 	if (handle == TC_H_INGRESS) {
1211 		sch->flags |= TCQ_F_INGRESS;
1212 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1213 	} else {
1214 		if (handle == 0) {
1215 			handle = qdisc_alloc_handle(dev);
1216 			if (handle == 0) {
1217 				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1218 				err = -ENOSPC;
1219 				goto err_out3;
1220 			}
1221 		}
1222 		if (!netif_is_multiqueue(dev))
1223 			sch->flags |= TCQ_F_ONETXQUEUE;
1224 	}
1225 
1226 	sch->handle = handle;
1227 
1228 	/* This exists to stay backward compatible with a userspace
1229 	 * loophole that allowed userspace to get the IFF_NO_QUEUE
1230 	 * facility on older kernels by setting tx_queue_len=0 (prior
1231 	 * to qdisc init), and then forgetting to reinit tx_queue_len
1232 	 * before attaching a qdisc again.
1233 	 */
1234 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1235 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1236 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1237 	}
1238 
1239 	err = qdisc_block_indexes_set(sch, tca, extack);
1240 	if (err)
1241 		goto err_out3;
1242 
1243 	if (ops->init) {
1244 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1245 		if (err != 0)
1246 			goto err_out5;
1247 	}
1248 
1249 	if (tca[TCA_STAB]) {
1250 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1251 		if (IS_ERR(stab)) {
1252 			err = PTR_ERR(stab);
1253 			goto err_out4;
1254 		}
1255 		rcu_assign_pointer(sch->stab, stab);
1256 	}
1257 	if (tca[TCA_RATE]) {
1258 		seqcount_t *running;
1259 
1260 		err = -EOPNOTSUPP;
1261 		if (sch->flags & TCQ_F_MQROOT) {
1262 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1263 			goto err_out4;
1264 		}
1265 
1266 		if (sch->parent != TC_H_ROOT &&
1267 		    !(sch->flags & TCQ_F_INGRESS) &&
1268 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1269 			running = qdisc_root_sleeping_running(sch);
1270 		else
1271 			running = &sch->running;
1272 
1273 		err = gen_new_estimator(&sch->bstats,
1274 					sch->cpu_bstats,
1275 					&sch->rate_est,
1276 					NULL,
1277 					running,
1278 					tca[TCA_RATE]);
1279 		if (err) {
1280 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1281 			goto err_out4;
1282 		}
1283 	}
1284 
1285 	qdisc_hash_add(sch, false);
1286 
1287 	return sch;
1288 
1289 err_out5:
1290 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1291 	if (ops->destroy)
1292 		ops->destroy(sch);
1293 err_out3:
1294 	dev_put(dev);
1295 	qdisc_free(sch);
1296 err_out2:
1297 	module_put(ops->owner);
1298 err_out:
1299 	*errp = err;
1300 	return NULL;
1301 
1302 err_out4:
1303 	/*
1304 	 * Any broken qdiscs that would require an ops->reset() here?
1305 	 * The qdisc was never in action so it shouldn't be necessary.
1306 	 */
1307 	qdisc_put_stab(rtnl_dereference(sch->stab));
1308 	if (ops->destroy)
1309 		ops->destroy(sch);
1310 	goto err_out3;
1311 }
1312 
1313 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1314 			struct netlink_ext_ack *extack)
1315 {
1316 	struct qdisc_size_table *ostab, *stab = NULL;
1317 	int err = 0;
1318 
1319 	if (tca[TCA_OPTIONS]) {
1320 		if (!sch->ops->change) {
1321 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1322 			return -EINVAL;
1323 		}
1324 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1325 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1326 			return -EOPNOTSUPP;
1327 		}
1328 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1329 		if (err)
1330 			return err;
1331 	}
1332 
1333 	if (tca[TCA_STAB]) {
1334 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1335 		if (IS_ERR(stab))
1336 			return PTR_ERR(stab);
1337 	}
1338 
1339 	ostab = rtnl_dereference(sch->stab);
1340 	rcu_assign_pointer(sch->stab, stab);
1341 	qdisc_put_stab(ostab);
1342 
1343 	if (tca[TCA_RATE]) {
1344 		/* NB: errors from gen_replace_estimator are ignored,
1345 		   because the change cannot be undone. */
1346 		if (sch->flags & TCQ_F_MQROOT)
1347 			goto out;
1348 		gen_replace_estimator(&sch->bstats,
1349 				      sch->cpu_bstats,
1350 				      &sch->rate_est,
1351 				      NULL,
1352 				      qdisc_root_sleeping_running(sch),
1353 				      tca[TCA_RATE]);
1354 	}
1355 out:
1356 	return 0;
1357 }
1358 
1359 struct check_loop_arg {
1360 	struct qdisc_walker	w;
1361 	struct Qdisc		*p;
1362 	int			depth;
1363 };
1364 
1365 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1366 			 struct qdisc_walker *w);
1367 
1368 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1369 {
1370 	struct check_loop_arg	arg;
1371 
1372 	if (q->ops->cl_ops == NULL)
1373 		return 0;
1374 
1375 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1376 	arg.w.fn = check_loop_fn;
1377 	arg.depth = depth;
1378 	arg.p = p;
1379 	q->ops->cl_ops->walk(q, &arg.w);
1380 	return arg.w.stop ? -ELOOP : 0;
1381 }
1382 
1383 static int
1384 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1385 {
1386 	struct Qdisc *leaf;
1387 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1388 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1389 
1390 	leaf = cops->leaf(q, cl);
1391 	if (leaf) {
1392 		if (leaf == arg->p || arg->depth > 7)
1393 			return -ELOOP;
1394 		return check_loop(leaf, arg->p, arg->depth + 1);
1395 	}
1396 	return 0;
1397 }
1398 
1399 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1400 	[TCA_KIND]		= { .type = NLA_STRING },
1401 	[TCA_RATE]		= { .type = NLA_BINARY,
1402 				    .len = sizeof(struct tc_estimator) },
1403 	[TCA_STAB]		= { .type = NLA_NESTED },
1404 	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
1405 	[TCA_CHAIN]		= { .type = NLA_U32 },
1406 	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
1407 	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
1408 };
1409 
1410 /*
1411  * Delete/get qdisc.
1412  */
1413 
1414 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1415 			struct netlink_ext_ack *extack)
1416 {
1417 	struct net *net = sock_net(skb->sk);
1418 	struct tcmsg *tcm = nlmsg_data(n);
1419 	struct nlattr *tca[TCA_MAX + 1];
1420 	struct net_device *dev;
1421 	u32 clid;
1422 	struct Qdisc *q = NULL;
1423 	struct Qdisc *p = NULL;
1424 	int err;
1425 
1426 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1427 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1428 		return -EPERM;
1429 
1430 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1431 				     rtm_tca_policy, extack);
1432 	if (err < 0)
1433 		return err;
1434 
1435 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1436 	if (!dev)
1437 		return -ENODEV;
1438 
1439 	clid = tcm->tcm_parent;
1440 	if (clid) {
1441 		if (clid != TC_H_ROOT) {
1442 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1443 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1444 				if (!p) {
1445 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1446 					return -ENOENT;
1447 				}
1448 				q = qdisc_leaf(p, clid);
1449 			} else if (dev_ingress_queue(dev)) {
1450 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1451 			}
1452 		} else {
1453 			q = dev->qdisc;
1454 		}
1455 		if (!q) {
1456 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1457 			return -ENOENT;
1458 		}
1459 
1460 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1461 			NL_SET_ERR_MSG(extack, "Invalid handle");
1462 			return -EINVAL;
1463 		}
1464 	} else {
1465 		q = qdisc_lookup(dev, tcm->tcm_handle);
1466 		if (!q) {
1467 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1468 			return -ENOENT;
1469 		}
1470 	}
1471 
1472 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1473 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1474 		return -EINVAL;
1475 	}
1476 
1477 	if (n->nlmsg_type == RTM_DELQDISC) {
1478 		if (!clid) {
1479 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1480 			return -EINVAL;
1481 		}
1482 		if (q->handle == 0) {
1483 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1484 			return -ENOENT;
1485 		}
1486 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1487 		if (err != 0)
1488 			return err;
1489 	} else {
1490 		qdisc_notify(net, skb, n, clid, NULL, q);
1491 	}
1492 	return 0;
1493 }
1494 
1495 /*
1496  * Create/change qdisc.
1497  */
1498 
1499 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1500 			   struct netlink_ext_ack *extack)
1501 {
1502 	struct net *net = sock_net(skb->sk);
1503 	struct tcmsg *tcm;
1504 	struct nlattr *tca[TCA_MAX + 1];
1505 	struct net_device *dev;
1506 	u32 clid;
1507 	struct Qdisc *q, *p;
1508 	int err;
1509 
1510 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1511 		return -EPERM;
1512 
1513 replay:
1514 	/* Reinit, just in case something touches this. */
1515 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1516 				     rtm_tca_policy, extack);
1517 	if (err < 0)
1518 		return err;
1519 
1520 	tcm = nlmsg_data(n);
1521 	clid = tcm->tcm_parent;
1522 	q = p = NULL;
1523 
1524 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1525 	if (!dev)
1526 		return -ENODEV;
1527 
1528 
1529 	if (clid) {
1530 		if (clid != TC_H_ROOT) {
1531 			if (clid != TC_H_INGRESS) {
1532 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1533 				if (!p) {
1534 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1535 					return -ENOENT;
1536 				}
1537 				q = qdisc_leaf(p, clid);
1538 			} else if (dev_ingress_queue_create(dev)) {
1539 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1540 			}
1541 		} else {
1542 			q = dev->qdisc;
1543 		}
1544 
1545 		/* It may be the default qdisc; ignore it */
1546 		if (q && q->handle == 0)
1547 			q = NULL;
1548 
1549 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1550 			if (tcm->tcm_handle) {
1551 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1552 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1553 					return -EEXIST;
1554 				}
1555 				if (TC_H_MIN(tcm->tcm_handle)) {
1556 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1557 					return -EINVAL;
1558 				}
1559 				q = qdisc_lookup(dev, tcm->tcm_handle);
1560 				if (!q)
1561 					goto create_n_graft;
1562 				if (n->nlmsg_flags & NLM_F_EXCL) {
1563 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1564 					return -EEXIST;
1565 				}
1566 				if (tca[TCA_KIND] &&
1567 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1568 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1569 					return -EINVAL;
1570 				}
1571 				if (q == p ||
1572 				    (p && check_loop(q, p, 0))) {
1573 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1574 					return -ELOOP;
1575 				}
1576 				qdisc_refcount_inc(q);
1577 				goto graft;
1578 			} else {
1579 				if (!q)
1580 					goto create_n_graft;
1581 
1582 				/* This magic test requires explanation.
1583 				 *
1584 				 *   We know that some child q is already
1585 				 *   attached to this parent and have a choice:
1586 				 *   either change it or create/graft a new one.
1587 				 *
1588 				 *   1. We are allowed to create/graft only
1589 				 *   if both CREATE and REPLACE flags are set.
1590 				 *
1591 				 *   2. If EXCL is set, the requestor meant
1592 				 *   that a qdisc with tcm_handle is not expected
1593 				 *   to exist, so we choose create/graft too.
1594 				 *
1595 				 *   3. The last case is when no flags are set.
1596 				 *   Alas, it is a sort of hole in the API: we
1597 				 *   cannot decide what to do unambiguously.
1598 				 *   For now we select create/graft if the
1599 				 *   user gave a KIND that does not match the existing one.
1600 				 */
1601 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1602 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1603 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1604 				     (tca[TCA_KIND] &&
1605 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1606 					goto create_n_graft;
1607 			}
1608 		}
1609 	} else {
1610 		if (!tcm->tcm_handle) {
1611 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1612 			return -EINVAL;
1613 		}
1614 		q = qdisc_lookup(dev, tcm->tcm_handle);
1615 	}
1616 
1617 	/* Change qdisc parameters */
1618 	if (!q) {
1619 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1620 		return -ENOENT;
1621 	}
1622 	if (n->nlmsg_flags & NLM_F_EXCL) {
1623 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1624 		return -EEXIST;
1625 	}
1626 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1627 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1628 		return -EINVAL;
1629 	}
1630 	err = qdisc_change(q, tca, extack);
1631 	if (err == 0)
1632 		qdisc_notify(net, skb, n, clid, NULL, q);
1633 	return err;
1634 
1635 create_n_graft:
1636 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1637 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1638 		return -ENOENT;
1639 	}
1640 	if (clid == TC_H_INGRESS) {
1641 		if (dev_ingress_queue(dev)) {
1642 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1643 					 tcm->tcm_parent, tcm->tcm_parent,
1644 					 tca, &err, extack);
1645 		} else {
1646 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1647 			err = -ENOENT;
1648 		}
1649 	} else {
1650 		struct netdev_queue *dev_queue;
1651 
1652 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1653 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1654 		else if (p)
1655 			dev_queue = p->dev_queue;
1656 		else
1657 			dev_queue = netdev_get_tx_queue(dev, 0);
1658 
1659 		q = qdisc_create(dev, dev_queue, p,
1660 				 tcm->tcm_parent, tcm->tcm_handle,
1661 				 tca, &err, extack);
1662 	}
1663 	if (q == NULL) {
1664 		if (err == -EAGAIN)
1665 			goto replay;
1666 		return err;
1667 	}
1668 
1669 graft:
1670 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1671 	if (err) {
1672 		if (q)
1673 			qdisc_put(q);
1674 		return err;
1675 	}
1676 
1677 	return 0;
1678 }
1679 
1680 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1681 			      struct netlink_callback *cb,
1682 			      int *q_idx_p, int s_q_idx, bool recur,
1683 			      bool dump_invisible)
1684 {
1685 	int ret = 0, q_idx = *q_idx_p;
1686 	struct Qdisc *q;
1687 	int b;
1688 
1689 	if (!root)
1690 		return 0;
1691 
1692 	q = root;
1693 	if (q_idx < s_q_idx) {
1694 		q_idx++;
1695 	} else {
1696 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1697 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1698 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1699 				  RTM_NEWQDISC) <= 0)
1700 			goto done;
1701 		q_idx++;
1702 	}
1703 
1704 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1705 	 * itself has already been dumped.
1706 	 *
1707 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1708 	 * qdisc hashtable, we don't want to hit it again.
1709 	 */
1710 	if (!qdisc_dev(root) || !recur)
1711 		goto out;
1712 
1713 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1714 		if (q_idx < s_q_idx) {
1715 			q_idx++;
1716 			continue;
1717 		}
1718 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1719 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1720 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1721 				  RTM_NEWQDISC) <= 0)
1722 			goto done;
1723 		q_idx++;
1724 	}
1725 
1726 out:
1727 	*q_idx_p = q_idx;
1728 	return ret;
1729 done:
1730 	ret = -1;
1731 	goto out;
1732 }
1733 
1734 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1735 {
1736 	struct net *net = sock_net(skb->sk);
1737 	int idx, q_idx;
1738 	int s_idx, s_q_idx;
1739 	struct net_device *dev;
1740 	const struct nlmsghdr *nlh = cb->nlh;
1741 	struct nlattr *tca[TCA_MAX + 1];
1742 	int err;
1743 
1744 	s_idx = cb->args[0];
1745 	s_q_idx = q_idx = cb->args[1];
1746 
1747 	idx = 0;
1748 	ASSERT_RTNL();
1749 
1750 	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1751 				     rtm_tca_policy, cb->extack);
1752 	if (err < 0)
1753 		return err;
1754 
1755 	for_each_netdev(net, dev) {
1756 		struct netdev_queue *dev_queue;
1757 
1758 		if (idx < s_idx)
1759 			goto cont;
1760 		if (idx > s_idx)
1761 			s_q_idx = 0;
1762 		q_idx = 0;
1763 
1764 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1765 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1766 			goto done;
1767 
1768 		dev_queue = dev_ingress_queue(dev);
1769 		if (dev_queue &&
1770 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1771 				       &q_idx, s_q_idx, false,
1772 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1773 			goto done;
1774 
1775 cont:
1776 		idx++;
1777 	}
1778 
1779 done:
1780 	cb->args[0] = idx;
1781 	cb->args[1] = q_idx;
1782 
1783 	return skb->len;
1784 }
1785 
1786 
1787 
1788 /************************************************
1789  *	Traffic classes manipulation.		*
1790  ************************************************/
1791 
1792 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1793 			  unsigned long cl,
1794 			  u32 portid, u32 seq, u16 flags, int event)
1795 {
1796 	struct tcmsg *tcm;
1797 	struct nlmsghdr  *nlh;
1798 	unsigned char *b = skb_tail_pointer(skb);
1799 	struct gnet_dump d;
1800 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1801 
1802 	cond_resched();
1803 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1804 	if (!nlh)
1805 		goto out_nlmsg_trim;
1806 	tcm = nlmsg_data(nlh);
1807 	tcm->tcm_family = AF_UNSPEC;
1808 	tcm->tcm__pad1 = 0;
1809 	tcm->tcm__pad2 = 0;
1810 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1811 	tcm->tcm_parent = q->handle;
1812 	tcm->tcm_handle = q->handle;
1813 	tcm->tcm_info = 0;
1814 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1815 		goto nla_put_failure;
1816 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1817 		goto nla_put_failure;
1818 
1819 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1820 					 NULL, &d, TCA_PAD) < 0)
1821 		goto nla_put_failure;
1822 
1823 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1824 		goto nla_put_failure;
1825 
1826 	if (gnet_stats_finish_copy(&d) < 0)
1827 		goto nla_put_failure;
1828 
1829 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1830 	return skb->len;
1831 
1832 out_nlmsg_trim:
1833 nla_put_failure:
1834 	nlmsg_trim(skb, b);
1835 	return -1;
1836 }
1837 
1838 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1839 			 struct nlmsghdr *n, struct Qdisc *q,
1840 			 unsigned long cl, int event)
1841 {
1842 	struct sk_buff *skb;
1843 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1844 	int err = 0;
1845 
1846 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1847 	if (!skb)
1848 		return -ENOBUFS;
1849 
1850 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1851 		kfree_skb(skb);
1852 		return -EINVAL;
1853 	}
1854 
1855 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1856 			     n->nlmsg_flags & NLM_F_ECHO);
1857 	if (err > 0)
1858 		err = 0;
1859 	return err;
1860 }
1861 
1862 static int tclass_del_notify(struct net *net,
1863 			     const struct Qdisc_class_ops *cops,
1864 			     struct sk_buff *oskb, struct nlmsghdr *n,
1865 			     struct Qdisc *q, unsigned long cl)
1866 {
1867 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1868 	struct sk_buff *skb;
1869 	int err = 0;
1870 
1871 	if (!cops->delete)
1872 		return -EOPNOTSUPP;
1873 
1874 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1875 	if (!skb)
1876 		return -ENOBUFS;
1877 
1878 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1879 			   RTM_DELTCLASS) < 0) {
1880 		kfree_skb(skb);
1881 		return -EINVAL;
1882 	}
1883 
1884 	err = cops->delete(q, cl);
1885 	if (err) {
1886 		kfree_skb(skb);
1887 		return err;
1888 	}
1889 
1890 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1891 			     n->nlmsg_flags & NLM_F_ECHO);
1892 	if (err > 0)
1893 		err = 0;
1894 	return err;
1895 }
1896 
1897 #ifdef CONFIG_NET_CLS
1898 
1899 struct tcf_bind_args {
1900 	struct tcf_walker w;
1901 	unsigned long base;
1902 	unsigned long cl;
1903 	u32 classid;
1904 };
1905 
1906 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1907 {
1908 	struct tcf_bind_args *a = (void *)arg;
1909 
1910 	if (tp->ops->bind_class) {
1911 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1912 
1913 		sch_tree_lock(q);
1914 		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1915 		sch_tree_unlock(q);
1916 	}
1917 	return 0;
1918 }
1919 
1920 struct tc_bind_class_args {
1921 	struct qdisc_walker w;
1922 	unsigned long new_cl;
1923 	u32 portid;
1924 	u32 clid;
1925 };
1926 
1927 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1928 				struct qdisc_walker *w)
1929 {
1930 	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1931 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1932 	struct tcf_block *block;
1933 	struct tcf_chain *chain;
1934 
1935 	block = cops->tcf_block(q, cl, NULL);
1936 	if (!block)
1937 		return 0;
1938 	for (chain = tcf_get_next_chain(block, NULL);
1939 	     chain;
1940 	     chain = tcf_get_next_chain(block, chain)) {
1941 		struct tcf_proto *tp;
1942 
1943 		for (tp = tcf_get_next_proto(chain, NULL, true);
1944 		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
1945 			struct tcf_bind_args arg = {};
1946 
1947 			arg.w.fn = tcf_node_bind;
1948 			arg.classid = a->clid;
1949 			arg.base = cl;
1950 			arg.cl = a->new_cl;
1951 			tp->ops->walk(tp, &arg.w, true);
1952 		}
1953 	}
1954 
1955 	return 0;
1956 }
1957 
1958 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1959 			   unsigned long new_cl)
1960 {
1961 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1962 	struct tc_bind_class_args args = {};
1963 
1964 	if (!cops->tcf_block)
1965 		return;
1966 	args.portid = portid;
1967 	args.clid = clid;
1968 	args.new_cl = new_cl;
1969 	args.w.fn = tc_bind_class_walker;
1970 	cops->walk(q, &args.w);
1971 }
1972 
1973 #else
1974 
1975 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1976 			   unsigned long new_cl)
1977 {
1978 }
1979 
1980 #endif
1981 
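/* rtnetlink handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS:
 * resolve the qdisc and class addressed by the request, then create,
 * change, delete or report the class accordingly.
 */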
1982 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1983 			 struct netlink_ext_ack *extack)
1984 {
1985 	struct net *net = sock_net(skb->sk);
1986 	struct tcmsg *tcm = nlmsg_data(n);
1987 	struct nlattr *tca[TCA_MAX + 1];
1988 	struct net_device *dev;
1989 	struct Qdisc *q = NULL;
1990 	const struct Qdisc_class_ops *cops;
1991 	unsigned long cl = 0;
1992 	unsigned long new_cl;
1993 	u32 portid;
1994 	u32 clid;
1995 	u32 qid;
1996 	int err;
1997 
1998 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1999 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2000 		return -EPERM;
2001 
2002 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2003 				     rtm_tca_policy, extack);
2004 	if (err < 0)
2005 		return err;
2006 
2007 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2008 	if (!dev)
2009 		return -ENODEV;
2010 
2011 	/*
2012 	   parent == TC_H_UNSPEC - unspecified parent.
2013 	   parent == TC_H_ROOT   - class is root, which has no parent.
2014 	   parent == X:0	 - parent is root class.
2015 	   parent == X:Y	 - parent is a node in hierarchy.
2016 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
2017 
2018 	   handle == 0:0	 - generate handle from kernel pool.
2019 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
2020 	   handle == X:Y	 - fully specified class X:Y.
2021 	   handle == X:0	 - root class.
2022 	 */
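	/* For example, an (illustrative) request such as
	 *   tc class add dev eth0 parent 1: classid 1:10 ...
	 * arrives with tcm_parent == 1:0 and tcm_handle == 1:10.
	 */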
2023 
2024 	/* Step 1. Determine qdisc handle X:0 */
2025 
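	/* Note: despite its name, 'portid' holds tcm_parent (a class
	 * handle) from here on, not a netlink port id.
	 */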
2026 	portid = tcm->tcm_parent;
2027 	clid = tcm->tcm_handle;
2028 	qid = TC_H_MAJ(clid);
2029 
2030 	if (portid != TC_H_ROOT) {
2031 		u32 qid1 = TC_H_MAJ(portid);
2032 
2033 		if (qid && qid1) {
2034 			/* If both majors are known, they must be identical. */
2035 			if (qid != qid1)
2036 				return -EINVAL;
2037 		} else if (qid1) {
2038 			qid = qid1;
2039 		} else if (qid == 0)
2040 			qid = dev->qdisc->handle;
2041 
2042 		/* Now qid is a genuine qdisc handle, consistent with
2043 		 * both the parent and the child.
2044 		 *
2045 		 * TC_H_MAJ(portid) may still be unspecified; complete it now.
2046 		 */
2047 		if (portid)
2048 			portid = TC_H_MAKE(qid, portid);
2049 	} else {
2050 		if (qid == 0)
2051 			qid = dev->qdisc->handle;
2052 	}
2053 
2054 	/* OK. Locate qdisc */
2055 	q = qdisc_lookup(dev, qid);
2056 	if (!q)
2057 		return -ENOENT;
2058 
2059 	/* And check that it supports classes */
2060 	cops = q->ops->cl_ops;
2061 	if (!cops)
2062 		return -EINVAL;
2063 
2064 	/* Now try to get class */
2065 	if (clid == 0) {
2066 		if (portid == TC_H_ROOT)
2067 			clid = qid;
2068 	} else
2069 		clid = TC_H_MAKE(qid, clid);
2070 
2071 	if (clid)
2072 		cl = cops->find(q, clid);
2073 
2074 	if (cl == 0) {
2075 		err = -ENOENT;
2076 		if (n->nlmsg_type != RTM_NEWTCLASS ||
2077 		    !(n->nlmsg_flags & NLM_F_CREATE))
2078 			goto out;
2079 	} else {
2080 		switch (n->nlmsg_type) {
2081 		case RTM_NEWTCLASS:
2082 			err = -EEXIST;
2083 			if (n->nlmsg_flags & NLM_F_EXCL)
2084 				goto out;
2085 			break;
2086 		case RTM_DELTCLASS:
2087 			err = tclass_del_notify(net, cops, skb, n, q, cl);
2088 		/* Unbind the deleted class from its filters (rebind them to class 0) */
2089 			tc_bind_tclass(q, portid, clid, 0);
2090 			goto out;
2091 		case RTM_GETTCLASS:
2092 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2093 			goto out;
2094 		default:
2095 			err = -EINVAL;
2096 			goto out;
2097 		}
2098 	}
2099 
2100 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2101 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2102 		return -EOPNOTSUPP;
2103 	}
2104 
2105 	new_cl = cl;
2106 	err = -EOPNOTSUPP;
2107 	if (cops->change)
2108 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
2109 	if (err == 0) {
2110 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2111 		/* We just created a new class; do the reverse binding. */
2112 		if (cl != new_cl)
2113 			tc_bind_tclass(q, portid, clid, new_cl);
2114 	}
2115 out:
2116 	return err;
2117 }
2118 
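/* State threaded through the class walker while dumping all classes of a
 * qdisc into a netlink dump callback.
 */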
2119 struct qdisc_dump_args {
2120 	struct qdisc_walker	w;
2121 	struct sk_buff		*skb;
2122 	struct netlink_callback	*cb;
2123 };
2124 
2125 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2126 			    struct qdisc_walker *arg)
2127 {
2128 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2129 
2130 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2131 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2132 			      RTM_NEWTCLASS);
2133 }
2134 
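/* Dump the classes of one qdisc. *t_p counts qdiscs visited so far and
 * s_t is the qdisc index to resume from; cb->args[1] carries the
 * per-qdisc class offset across dump invocations.
 */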
2135 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2136 				struct tcmsg *tcm, struct netlink_callback *cb,
2137 				int *t_p, int s_t)
2138 {
2139 	struct qdisc_dump_args arg;
2140 
2141 	if (tc_qdisc_dump_ignore(q, false) ||
2142 	    *t_p < s_t || !q->ops->cl_ops ||
2143 	    (tcm->tcm_parent &&
2144 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2145 		(*t_p)++;
2146 		return 0;
2147 	}
2148 	if (*t_p > s_t)
2149 		memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
2150 	arg.w.fn = qdisc_class_dump;
2151 	arg.skb = skb;
2152 	arg.cb = cb;
2153 	arg.w.stop = 0;
2154 	arg.w.skip = cb->args[1];
2155 	arg.w.count = 0;
2156 	q->ops->cl_ops->walk(q, &arg.w);
2157 	cb->args[1] = arg.w.count;
2158 	if (arg.w.stop)
2159 		return -1;
2160 	(*t_p)++;
2161 	return 0;
2162 }
2163 
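/* Dump classes for a whole qdisc tree: the root qdisc itself, then either
 * the one child matching tcm_parent or every qdisc on the device hash.
 */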
2164 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2165 			       struct tcmsg *tcm, struct netlink_callback *cb,
2166 			       int *t_p, int s_t)
2167 {
2168 	struct Qdisc *q;
2169 	int b;
2170 
2171 	if (!root)
2172 		return 0;
2173 
2174 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2175 		return -1;
2176 
2177 	if (!qdisc_dev(root))
2178 		return 0;
2179 
2180 	if (tcm->tcm_parent) {
2181 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2182 		if (q && q != root &&
2183 		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2184 			return -1;
2185 		return 0;
2186 	}
2187 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2188 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189 			return -1;
2190 	}
2191 
2192 	return 0;
2193 }
2194 
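/* Netlink dump handler for RTM_GETTCLASS: walk both the regular egress
 * tree and, if present, the ingress qdisc of the target device.
 */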
2195 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2196 {
2197 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2198 	struct net *net = sock_net(skb->sk);
2199 	struct netdev_queue *dev_queue;
2200 	struct net_device *dev;
2201 	int t, s_t;
2202 
2203 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2204 		return 0;
2205 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2206 	if (!dev)
2207 		return 0;
2208 
2209 	s_t = cb->args[0];
2210 	t = 0;
2211 
2212 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2213 		goto done;
2214 
2215 	dev_queue = dev_ingress_queue(dev);
2216 	if (dev_queue &&
2217 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2218 				&t, s_t) < 0)
2219 		goto done;
2220 
2221 done:
2222 	cb->args[0] = t;
2223 
2224 	dev_put(dev);
2225 	return skb->len;
2226 }
2227 
2228 #ifdef CONFIG_PROC_FS
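/* /proc/net/psched exposes the clock parameters that userspace traffic
 * control tools (e.g. iproute2's tc) use to convert between time units
 * and scheduler ticks.
 */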
2229 static int psched_show(struct seq_file *seq, void *v)
2230 {
2231 	seq_printf(seq, "%08x %08x %08x %08x\n",
2232 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2233 		   1000000,
2234 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2235 
2236 	return 0;
2237 }
2238 
2239 static int __net_init psched_net_init(struct net *net)
2240 {
2241 	struct proc_dir_entry *e;
2242 
2243 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2244 	if (!e)
2245 		return -ENOMEM;
2246 
2247 	return 0;
2248 }
2249 
2250 static void __net_exit psched_net_exit(struct net *net)
2251 {
2252 	remove_proc_entry("psched", net->proc_net);
2253 }
2254 #else
2255 static int __net_init psched_net_init(struct net *net)
2256 {
2257 	return 0;
2258 }
2259 
2260 static void __net_exit psched_net_exit(struct net *net)
2261 {
2262 }
2263 #endif
2264 
2265 static struct pernet_operations psched_net_ops = {
2266 	.init = psched_net_init,
2267 	.exit = psched_net_exit,
2268 };
2269 
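/* Module init: set up the per-netns /proc entry, register the built-in
 * qdiscs and hook the qdisc/class rtnetlink message handlers.
 */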
2270 static int __init pktsched_init(void)
2271 {
2272 	int err;
2273 
2274 	err = register_pernet_subsys(&psched_net_ops);
2275 	if (err) {
2276 		pr_err("pktsched_init: cannot initialize per netns operations\n");
2278 		return err;
2279 	}
2280 
2281 	register_qdisc(&pfifo_fast_ops);
2282 	register_qdisc(&pfifo_qdisc_ops);
2283 	register_qdisc(&bfifo_qdisc_ops);
2284 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2285 	register_qdisc(&mq_qdisc_ops);
2286 	register_qdisc(&noqueue_qdisc_ops);
2287 
2288 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2289 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2290 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2291 		      0);
2292 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2293 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2294 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2295 		      0);
2296 
2297 	return 0;
2298 }
2299 
2300 subsys_initcall(pktsched_init);
2301