xref: /linux/net/sched/sch_api.c (revision fb1ceb29b27cda91af35851ebab01f298d82162e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c	Packet scheduler API.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13 
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28 
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35 
36 #include <trace/events/qdisc.h>
37 
38 /*
39 
40    Short review.
41    -------------
42 
43    This file consists of two interrelated parts:
44 
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47 
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52 
53    qdisc's are divided to two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets to "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57 
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60 
61    The goal of the routines in this file is to translate
62    information supplied by user in the form of handles
63    to more intelligible for kernel form, to make some sanity
64    checks and part of work, which is common to all qdiscs
65    and to provide rtnetlink notifications.
66 
67    All real intelligent work is done inside qdisc modules.
68 
69 
70 
71    Every discipline has two major routines: enqueue and dequeue.
72 
73    ---dequeue
74 
75    dequeue usually returns a skb to send. It is allowed to return NULL,
76    but it does not mean that queue is empty, it just means that
77    discipline does not want to send anything this time.
78    Queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues q->q is not
80    real packet queue, but however q->q.qlen must be valid.
81 
82    ---enqueue
83 
84    enqueue returns 0, if packet was enqueued successfully.
85    If packet (this one or another one) was dropped, it returns
86    not zero error code.
87    NET_XMIT_DROP 	- this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91 
92    Auxiliary routines:
93 
94    ---peek
95 
96    like dequeue but without removing a packet from the queue
97 
98    ---reset
99 
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102 
103    ---init
104 
105    initializes newly created qdisc.
106 
107    ---destroy
108 
109    destroys resources allocated by init and during lifetime of qdisc.
110 
111    ---change
112 
113    changes qdisc parameters.
114  */
115 
116 /* Protects list of registered TC modules. It is pure SMP lock. */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118 
119 
120 /************************************************
121  *	Queueing disciplines manipulation.	*
122  ************************************************/
123 
124 
125 /* The list of all installed queueing disciplines. */
126 
127 static struct Qdisc_ops *qdisc_base;
128 
129 /* Register/unregister queueing discipline */
130 
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133 	struct Qdisc_ops *q, **qp;
134 	int rc = -EEXIST;
135 
136 	write_lock(&qdisc_mod_lock);
137 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138 		if (!strcmp(qops->id, q->id))
139 			goto out;
140 
141 	if (qops->enqueue == NULL)
142 		qops->enqueue = noop_qdisc_ops.enqueue;
143 	if (qops->peek == NULL) {
144 		if (qops->dequeue == NULL)
145 			qops->peek = noop_qdisc_ops.peek;
146 		else
147 			goto out_einval;
148 	}
149 	if (qops->dequeue == NULL)
150 		qops->dequeue = noop_qdisc_ops.dequeue;
151 
152 	if (qops->cl_ops) {
153 		const struct Qdisc_class_ops *cops = qops->cl_ops;
154 
155 		if (!(cops->find && cops->walk && cops->leaf))
156 			goto out_einval;
157 
158 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159 			goto out_einval;
160 	}
161 
162 	qops->next = NULL;
163 	*qp = qops;
164 	rc = 0;
165 out:
166 	write_unlock(&qdisc_mod_lock);
167 	return rc;
168 
169 out_einval:
170 	rc = -EINVAL;
171 	goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174 
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177 	struct Qdisc_ops *q, **qp;
178 	int err = -ENOENT;
179 
180 	write_lock(&qdisc_mod_lock);
181 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182 		if (q == qops)
183 			break;
184 	if (q) {
185 		*qp = q->next;
186 		q->next = NULL;
187 		err = 0;
188 	}
189 	write_unlock(&qdisc_mod_lock);
190 
191 	WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194 
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198 	read_lock(&qdisc_mod_lock);
199 	strscpy(name, default_qdisc_ops->id, len);
200 	read_unlock(&qdisc_mod_lock);
201 }
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205 	struct Qdisc_ops *q = NULL;
206 
207 	for (q = qdisc_base; q; q = q->next) {
208 		if (!strcmp(name, q->id)) {
209 			if (!try_module_get(q->owner))
210 				q = NULL;
211 			break;
212 		}
213 	}
214 
215 	return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221 	const struct Qdisc_ops *ops;
222 
223 	if (!capable(CAP_NET_ADMIN))
224 		return -EPERM;
225 
226 	write_lock(&qdisc_mod_lock);
227 	ops = qdisc_lookup_default(name);
228 	if (!ops) {
229 		/* Not found, drop lock and try to load module */
230 		write_unlock(&qdisc_mod_lock);
231 		request_module(NET_SCH_ALIAS_PREFIX "%s", name);
232 		write_lock(&qdisc_mod_lock);
233 
234 		ops = qdisc_lookup_default(name);
235 	}
236 
237 	if (ops) {
238 		/* Set new default */
239 		module_put(default_qdisc_ops->owner);
240 		default_qdisc_ops = ops;
241 	}
242 	write_unlock(&qdisc_mod_lock);
243 
244 	return ops ? 0 : -ENOENT;
245 }
246 
247 #ifdef CONFIG_NET_SCH_DEFAULT
248 /* Set default value from kernel config */
249 static int __init sch_default_qdisc(void)
250 {
251 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 }
253 late_initcall(sch_default_qdisc);
254 #endif
255 
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263 	struct Qdisc *q;
264 
265 	if (!qdisc_dev(root))
266 		return (root->handle == handle ? root : NULL);
267 
268 	if (!(root->flags & TCQ_F_BUILTIN) &&
269 	    root->handle == handle)
270 		return root;
271 
272 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273 				   lockdep_rtnl_is_held()) {
274 		if (q->handle == handle)
275 			return q;
276 	}
277 	return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 		ASSERT_RTNL();
284 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 		if (invisible)
286 			q->flags |= TCQ_F_INVISIBLE;
287 	}
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 		ASSERT_RTNL();
295 		hash_del_rcu(&q->hash);
296 	}
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 	struct Qdisc *q;
303 
304 	if (!handle)
305 		return NULL;
306 	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307 	if (q)
308 		goto out;
309 
310 	if (dev_ingress_queue(dev))
311 		q = qdisc_match_from_root(
312 			rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313 			handle);
314 out:
315 	return q;
316 }
317 
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320 	struct netdev_queue *nq;
321 	struct Qdisc *q;
322 
323 	if (!handle)
324 		return NULL;
325 	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326 	if (q)
327 		goto out;
328 
329 	nq = dev_ingress_queue_rcu(dev);
330 	if (nq)
331 		q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332 					  handle);
333 out:
334 	return q;
335 }
336 
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339 	unsigned long cl;
340 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341 
342 	if (cops == NULL)
343 		return NULL;
344 	cl = cops->find(p, classid);
345 
346 	if (cl == 0)
347 		return NULL;
348 	return cops->leaf(p, cl);
349 }
350 
351 /* Find queueing discipline by name */
352 
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355 	struct Qdisc_ops *q = NULL;
356 
357 	if (kind) {
358 		read_lock(&qdisc_mod_lock);
359 		for (q = qdisc_base; q; q = q->next) {
360 			if (nla_strcmp(kind, q->id) == 0) {
361 				if (!try_module_get(q->owner))
362 					q = NULL;
363 				break;
364 			}
365 		}
366 		read_unlock(&qdisc_mod_lock);
367 	}
368 	return q;
369 }
370 
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390 	int low       = roundup(r->mpu, 48);
391 	int high      = roundup(low+1, 48);
392 	int cell_low  = low >> r->cell_log;
393 	int cell_high = (high >> r->cell_log) - 1;
394 
395 	/* rtab is too inaccurate at rates > 100Mbit/s */
396 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397 		pr_debug("TC linklayer: Giving up ATM detection\n");
398 		return TC_LINKLAYER_ETHERNET;
399 	}
400 
401 	if ((cell_high > cell_low) && (cell_high < 256)
402 	    && (rtab[cell_low] == rtab[cell_high])) {
403 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404 			 cell_low, cell_high, rtab[cell_high]);
405 		return TC_LINKLAYER_ATM;
406 	}
407 	return TC_LINKLAYER_ETHERNET;
408 }
409 
410 static struct qdisc_rate_table *qdisc_rtab_list;
411 
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413 					struct nlattr *tab,
414 					struct netlink_ext_ack *extack)
415 {
416 	struct qdisc_rate_table *rtab;
417 
418 	if (tab == NULL || r->rate == 0 ||
419 	    r->cell_log == 0 || r->cell_log >= 32 ||
420 	    nla_len(tab) != TC_RTAB_SIZE) {
421 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422 		return NULL;
423 	}
424 
425 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
428 			rtab->refcnt++;
429 			return rtab;
430 		}
431 	}
432 
433 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434 	if (rtab) {
435 		rtab->rate = *r;
436 		rtab->refcnt = 1;
437 		memcpy(rtab->data, nla_data(tab), 1024);
438 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
439 			r->linklayer = __detect_linklayer(r, rtab->data);
440 		rtab->next = qdisc_rtab_list;
441 		qdisc_rtab_list = rtab;
442 	} else {
443 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444 	}
445 	return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448 
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451 	struct qdisc_rate_table *rtab, **rtabp;
452 
453 	if (!tab || --tab->refcnt)
454 		return;
455 
456 	for (rtabp = &qdisc_rtab_list;
457 	     (rtab = *rtabp) != NULL;
458 	     rtabp = &rtab->next) {
459 		if (rtab == tab) {
460 			*rtabp = rtab->next;
461 			kfree(rtab);
462 			return;
463 		}
464 	}
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467 
468 static LIST_HEAD(qdisc_stab_list);
469 
470 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
471 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
472 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
473 };
474 
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476 					       struct netlink_ext_ack *extack)
477 {
478 	struct nlattr *tb[TCA_STAB_MAX + 1];
479 	struct qdisc_size_table *stab;
480 	struct tc_sizespec *s;
481 	unsigned int tsize = 0;
482 	u16 *tab = NULL;
483 	int err;
484 
485 	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486 					  extack);
487 	if (err < 0)
488 		return ERR_PTR(err);
489 	if (!tb[TCA_STAB_BASE]) {
490 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491 		return ERR_PTR(-EINVAL);
492 	}
493 
494 	s = nla_data(tb[TCA_STAB_BASE]);
495 
496 	if (s->tsize > 0) {
497 		if (!tb[TCA_STAB_DATA]) {
498 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499 			return ERR_PTR(-EINVAL);
500 		}
501 		tab = nla_data(tb[TCA_STAB_DATA]);
502 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503 	}
504 
505 	if (tsize != s->tsize || (!tab && tsize > 0)) {
506 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
507 		return ERR_PTR(-EINVAL);
508 	}
509 
510 	list_for_each_entry(stab, &qdisc_stab_list, list) {
511 		if (memcmp(&stab->szopts, s, sizeof(*s)))
512 			continue;
513 		if (tsize > 0 &&
514 		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515 			continue;
516 		stab->refcnt++;
517 		return stab;
518 	}
519 
520 	if (s->size_log > STAB_SIZE_LOG_MAX ||
521 	    s->cell_log > STAB_SIZE_LOG_MAX) {
522 		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523 		return ERR_PTR(-EINVAL);
524 	}
525 
526 	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527 	if (!stab)
528 		return ERR_PTR(-ENOMEM);
529 
530 	stab->refcnt = 1;
531 	stab->szopts = *s;
532 	if (tsize > 0)
533 		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534 
535 	list_add_tail(&stab->list, &qdisc_stab_list);
536 
537 	return stab;
538 }
539 
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542 	if (!tab)
543 		return;
544 
545 	if (--tab->refcnt == 0) {
546 		list_del(&tab->list);
547 		kfree_rcu(tab, rcu);
548 	}
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551 
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554 	struct nlattr *nest;
555 
556 	nest = nla_nest_start_noflag(skb, TCA_STAB);
557 	if (nest == NULL)
558 		goto nla_put_failure;
559 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560 		goto nla_put_failure;
561 	nla_nest_end(skb, nest);
562 
563 	return skb->len;
564 
565 nla_put_failure:
566 	return -1;
567 }
568 
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570 			       const struct qdisc_size_table *stab)
571 {
572 	int pkt_len, slot;
573 
574 	pkt_len = skb->len + stab->szopts.overhead;
575 	if (unlikely(!stab->szopts.tsize))
576 		goto out;
577 
578 	slot = pkt_len + stab->szopts.cell_align;
579 	if (unlikely(slot < 0))
580 		slot = 0;
581 
582 	slot >>= stab->szopts.cell_log;
583 	if (likely(slot < stab->szopts.tsize))
584 		pkt_len = stab->data[slot];
585 	else
586 		pkt_len = stab->data[stab->szopts.tsize - 1] *
587 				(slot / stab->szopts.tsize) +
588 				stab->data[slot % stab->szopts.tsize];
589 
590 	pkt_len <<= stab->szopts.size_log;
591 out:
592 	if (unlikely(pkt_len < 1))
593 		pkt_len = 1;
594 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601 			txt, qdisc->ops->id, qdisc->handle >> 16);
602 		qdisc->flags |= TCQ_F_WARN_NONWC;
603 	}
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606 
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610 						 timer);
611 
612 	rcu_read_lock();
613 	__netif_schedule(qdisc_root(wd->qdisc));
614 	rcu_read_unlock();
615 
616 	return HRTIMER_NORESTART;
617 }
618 
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620 				 clockid_t clockid)
621 {
622 	hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED);
623 	wd->qdisc = qdisc;
624 }
625 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
626 
627 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
628 {
629 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
630 }
631 EXPORT_SYMBOL(qdisc_watchdog_init);
632 
633 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
634 				      u64 delta_ns)
635 {
636 	bool deactivated;
637 
638 	rcu_read_lock();
639 	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
640 			       &qdisc_root_sleeping(wd->qdisc)->state);
641 	rcu_read_unlock();
642 	if (deactivated)
643 		return;
644 
645 	if (hrtimer_is_queued(&wd->timer)) {
646 		u64 softexpires;
647 
648 		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
649 		/* If timer is already set in [expires, expires + delta_ns],
650 		 * do not reprogram it.
651 		 */
652 		if (softexpires - expires <= delta_ns)
653 			return;
654 	}
655 
656 	hrtimer_start_range_ns(&wd->timer,
657 			       ns_to_ktime(expires),
658 			       delta_ns,
659 			       HRTIMER_MODE_ABS_PINNED);
660 }
661 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
662 
663 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
664 {
665 	hrtimer_cancel(&wd->timer);
666 }
667 EXPORT_SYMBOL(qdisc_watchdog_cancel);
668 
669 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
670 {
671 	struct hlist_head *h;
672 	unsigned int i;
673 
674 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
675 
676 	if (h != NULL) {
677 		for (i = 0; i < n; i++)
678 			INIT_HLIST_HEAD(&h[i]);
679 	}
680 	return h;
681 }
682 
683 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
684 {
685 	struct Qdisc_class_common *cl;
686 	struct hlist_node *next;
687 	struct hlist_head *nhash, *ohash;
688 	unsigned int nsize, nmask, osize;
689 	unsigned int i, h;
690 
691 	/* Rehash when load factor exceeds 0.75 */
692 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
693 		return;
694 	nsize = clhash->hashsize * 2;
695 	nmask = nsize - 1;
696 	nhash = qdisc_class_hash_alloc(nsize);
697 	if (nhash == NULL)
698 		return;
699 
700 	ohash = clhash->hash;
701 	osize = clhash->hashsize;
702 
703 	sch_tree_lock(sch);
704 	for (i = 0; i < osize; i++) {
705 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
706 			h = qdisc_class_hash(cl->classid, nmask);
707 			hlist_add_head(&cl->hnode, &nhash[h]);
708 		}
709 	}
710 	clhash->hash     = nhash;
711 	clhash->hashsize = nsize;
712 	clhash->hashmask = nmask;
713 	sch_tree_unlock(sch);
714 
715 	kvfree(ohash);
716 }
717 EXPORT_SYMBOL(qdisc_class_hash_grow);
718 
719 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
720 {
721 	unsigned int size = 4;
722 
723 	clhash->hash = qdisc_class_hash_alloc(size);
724 	if (!clhash->hash)
725 		return -ENOMEM;
726 	clhash->hashsize  = size;
727 	clhash->hashmask  = size - 1;
728 	clhash->hashelems = 0;
729 	return 0;
730 }
731 EXPORT_SYMBOL(qdisc_class_hash_init);
732 
733 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
734 {
735 	kvfree(clhash->hash);
736 }
737 EXPORT_SYMBOL(qdisc_class_hash_destroy);
738 
739 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
740 			     struct Qdisc_class_common *cl)
741 {
742 	unsigned int h;
743 
744 	INIT_HLIST_NODE(&cl->hnode);
745 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
746 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
747 	clhash->hashelems++;
748 }
749 EXPORT_SYMBOL(qdisc_class_hash_insert);
750 
751 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
752 			     struct Qdisc_class_common *cl)
753 {
754 	hlist_del(&cl->hnode);
755 	clhash->hashelems--;
756 }
757 EXPORT_SYMBOL(qdisc_class_hash_remove);
758 
759 /* Allocate an unique handle from space managed by kernel
760  * Possible range is [8000-FFFF]:0000 (0x8000 values)
761  */
762 static u32 qdisc_alloc_handle(struct net_device *dev)
763 {
764 	int i = 0x8000;
765 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
766 
767 	do {
768 		autohandle += TC_H_MAKE(0x10000U, 0);
769 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
770 			autohandle = TC_H_MAKE(0x80000000U, 0);
771 		if (!qdisc_lookup(dev, autohandle))
772 			return autohandle;
773 		cond_resched();
774 	} while	(--i > 0);
775 
776 	return 0;
777 }
778 
779 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
780 {
781 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
782 	const struct Qdisc_class_ops *cops;
783 	unsigned long cl;
784 	u32 parentid;
785 	bool notify;
786 	int drops;
787 
788 	if (n == 0 && len == 0)
789 		return;
790 	drops = max_t(int, n, 0);
791 	rcu_read_lock();
792 	while ((parentid = sch->parent)) {
793 		if (parentid == TC_H_ROOT)
794 			break;
795 
796 		if (sch->flags & TCQ_F_NOPARENT)
797 			break;
798 		/* Notify parent qdisc only if child qdisc becomes empty.
799 		 *
800 		 * If child was empty even before update then backlog
801 		 * counter is screwed and we skip notification because
802 		 * parent class is already passive.
803 		 *
804 		 * If the original child was offloaded then it is allowed
805 		 * to be seem as empty, so the parent is notified anyway.
806 		 */
807 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
808 						       !qdisc_is_offloaded);
809 		/* TODO: perform the search on a per txq basis */
810 		sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
811 		if (sch == NULL) {
812 			WARN_ON_ONCE(parentid != TC_H_ROOT);
813 			break;
814 		}
815 		cops = sch->ops->cl_ops;
816 		if (notify && cops->qlen_notify) {
817 			cl = cops->find(sch, parentid);
818 			cops->qlen_notify(sch, cl);
819 		}
820 		sch->q.qlen -= n;
821 		sch->qstats.backlog -= len;
822 		__qdisc_qstats_drop(sch, drops);
823 	}
824 	rcu_read_unlock();
825 }
826 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
827 
828 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
829 			      void *type_data)
830 {
831 	struct net_device *dev = qdisc_dev(sch);
832 	int err;
833 
834 	sch->flags &= ~TCQ_F_OFFLOADED;
835 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
836 		return 0;
837 
838 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
839 	if (err == -EOPNOTSUPP)
840 		return 0;
841 
842 	if (!err)
843 		sch->flags |= TCQ_F_OFFLOADED;
844 
845 	return err;
846 }
847 EXPORT_SYMBOL(qdisc_offload_dump_helper);
848 
849 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
850 				struct Qdisc *new, struct Qdisc *old,
851 				enum tc_setup_type type, void *type_data,
852 				struct netlink_ext_ack *extack)
853 {
854 	bool any_qdisc_is_offloaded;
855 	int err;
856 
857 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
858 		return;
859 
860 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
861 
862 	/* Don't report error if the graft is part of destroy operation. */
863 	if (!err || !new || new == &noop_qdisc)
864 		return;
865 
866 	/* Don't report error if the parent, the old child and the new
867 	 * one are not offloaded.
868 	 */
869 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
870 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
871 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
872 
873 	if (any_qdisc_is_offloaded)
874 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
875 }
876 EXPORT_SYMBOL(qdisc_offload_graft_helper);
877 
878 void qdisc_offload_query_caps(struct net_device *dev,
879 			      enum tc_setup_type type,
880 			      void *caps, size_t caps_len)
881 {
882 	const struct net_device_ops *ops = dev->netdev_ops;
883 	struct tc_query_caps_base base = {
884 		.type = type,
885 		.caps = caps,
886 	};
887 
888 	memset(caps, 0, caps_len);
889 
890 	if (ops->ndo_setup_tc)
891 		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
892 }
893 EXPORT_SYMBOL(qdisc_offload_query_caps);
894 
895 static void qdisc_offload_graft_root(struct net_device *dev,
896 				     struct Qdisc *new, struct Qdisc *old,
897 				     struct netlink_ext_ack *extack)
898 {
899 	struct tc_root_qopt_offload graft_offload = {
900 		.command	= TC_ROOT_GRAFT,
901 		.handle		= new ? new->handle : 0,
902 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
903 				  (old && old->flags & TCQ_F_INGRESS),
904 	};
905 
906 	qdisc_offload_graft_helper(dev, NULL, new, old,
907 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
908 }
909 
910 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
911 			 u32 portid, u32 seq, u16 flags, int event,
912 			 struct netlink_ext_ack *extack)
913 {
914 	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
915 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
916 	struct tcmsg *tcm;
917 	struct nlmsghdr  *nlh;
918 	unsigned char *b = skb_tail_pointer(skb);
919 	struct gnet_dump d;
920 	struct qdisc_size_table *stab;
921 	u32 block_index;
922 	__u32 qlen;
923 
924 	cond_resched();
925 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
926 	if (!nlh)
927 		goto out_nlmsg_trim;
928 	tcm = nlmsg_data(nlh);
929 	tcm->tcm_family = AF_UNSPEC;
930 	tcm->tcm__pad1 = 0;
931 	tcm->tcm__pad2 = 0;
932 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
933 	tcm->tcm_parent = clid;
934 	tcm->tcm_handle = q->handle;
935 	tcm->tcm_info = refcount_read(&q->refcnt);
936 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
937 		goto nla_put_failure;
938 	if (q->ops->ingress_block_get) {
939 		block_index = q->ops->ingress_block_get(q);
940 		if (block_index &&
941 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
942 			goto nla_put_failure;
943 	}
944 	if (q->ops->egress_block_get) {
945 		block_index = q->ops->egress_block_get(q);
946 		if (block_index &&
947 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
948 			goto nla_put_failure;
949 	}
950 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
951 		goto nla_put_failure;
952 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
953 		goto nla_put_failure;
954 	qlen = qdisc_qlen_sum(q);
955 
956 	stab = rtnl_dereference(q->stab);
957 	if (stab && qdisc_dump_stab(skb, stab) < 0)
958 		goto nla_put_failure;
959 
960 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
961 					 NULL, &d, TCA_PAD) < 0)
962 		goto nla_put_failure;
963 
964 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
965 		goto nla_put_failure;
966 
967 	if (qdisc_is_percpu_stats(q)) {
968 		cpu_bstats = q->cpu_bstats;
969 		cpu_qstats = q->cpu_qstats;
970 	}
971 
972 	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
973 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
974 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
975 		goto nla_put_failure;
976 
977 	if (gnet_stats_finish_copy(&d) < 0)
978 		goto nla_put_failure;
979 
980 	if (extack && extack->_msg &&
981 	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
982 		goto out_nlmsg_trim;
983 
984 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
985 
986 	return skb->len;
987 
988 out_nlmsg_trim:
989 nla_put_failure:
990 	nlmsg_trim(skb, b);
991 	return -1;
992 }
993 
994 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
995 {
996 	if (q->flags & TCQ_F_BUILTIN)
997 		return true;
998 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
999 		return true;
1000 
1001 	return false;
1002 }
1003 
1004 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
1005 			    struct nlmsghdr *n, u32 clid, struct Qdisc *q,
1006 			    struct netlink_ext_ack *extack)
1007 {
1008 	struct sk_buff *skb;
1009 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1010 
1011 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1012 	if (!skb)
1013 		return -ENOBUFS;
1014 
1015 	if (!tc_qdisc_dump_ignore(q, false)) {
1016 		if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
1017 				  RTM_NEWQDISC, extack) < 0)
1018 			goto err_out;
1019 	}
1020 
1021 	if (skb->len)
1022 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1023 				      n->nlmsg_flags & NLM_F_ECHO);
1024 
1025 err_out:
1026 	kfree_skb(skb);
1027 	return -EINVAL;
1028 }
1029 
1030 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1031 			struct nlmsghdr *n, u32 clid,
1032 			struct Qdisc *old, struct Qdisc *new,
1033 			struct netlink_ext_ack *extack)
1034 {
1035 	struct sk_buff *skb;
1036 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1037 
1038 	if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1039 		return 0;
1040 
1041 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1042 	if (!skb)
1043 		return -ENOBUFS;
1044 
1045 	if (old && !tc_qdisc_dump_ignore(old, false)) {
1046 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1047 				  0, RTM_DELQDISC, extack) < 0)
1048 			goto err_out;
1049 	}
1050 	if (new && !tc_qdisc_dump_ignore(new, false)) {
1051 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1052 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1053 			goto err_out;
1054 	}
1055 
1056 	if (skb->len)
1057 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1058 				      n->nlmsg_flags & NLM_F_ECHO);
1059 
1060 err_out:
1061 	kfree_skb(skb);
1062 	return -EINVAL;
1063 }
1064 
1065 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1066 			       struct nlmsghdr *n, u32 clid,
1067 			       struct Qdisc *old, struct Qdisc *new,
1068 			       struct netlink_ext_ack *extack)
1069 {
1070 	if (new || old)
1071 		qdisc_notify(net, skb, n, clid, old, new, extack);
1072 
1073 	if (old)
1074 		qdisc_put(old);
1075 }
1076 
1077 static void qdisc_clear_nolock(struct Qdisc *sch)
1078 {
1079 	sch->flags &= ~TCQ_F_NOLOCK;
1080 	if (!(sch->flags & TCQ_F_CPUSTATS))
1081 		return;
1082 
1083 	free_percpu(sch->cpu_bstats);
1084 	free_percpu(sch->cpu_qstats);
1085 	sch->cpu_bstats = NULL;
1086 	sch->cpu_qstats = NULL;
1087 	sch->flags &= ~TCQ_F_CPUSTATS;
1088 }
1089 
1090 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1091  * to device "dev".
1092  *
1093  * When appropriate send a netlink notification using 'skb'
1094  * and "n".
1095  *
1096  * On success, destroy old qdisc.
1097  */
1098 
1099 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1100 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1101 		       struct Qdisc *new, struct Qdisc *old,
1102 		       struct netlink_ext_ack *extack)
1103 {
1104 	struct Qdisc *q = old;
1105 	struct net *net = dev_net(dev);
1106 
1107 	if (parent == NULL) {
1108 		unsigned int i, num_q, ingress;
1109 		struct netdev_queue *dev_queue;
1110 
1111 		ingress = 0;
1112 		num_q = dev->num_tx_queues;
1113 		if ((q && q->flags & TCQ_F_INGRESS) ||
1114 		    (new && new->flags & TCQ_F_INGRESS)) {
1115 			ingress = 1;
1116 			dev_queue = dev_ingress_queue(dev);
1117 			if (!dev_queue) {
1118 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1119 				return -ENOENT;
1120 			}
1121 
1122 			q = rtnl_dereference(dev_queue->qdisc_sleeping);
1123 
1124 			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
1125 			 * __tcf_qdisc_find() for filter requests.
1126 			 */
1127 			if (!qdisc_refcount_dec_if_one(q)) {
1128 				NL_SET_ERR_MSG(extack,
1129 					       "Current ingress or clsact Qdisc has ongoing filter requests");
1130 				return -EBUSY;
1131 			}
1132 		}
1133 
1134 		if (dev->flags & IFF_UP)
1135 			dev_deactivate(dev);
1136 
1137 		qdisc_offload_graft_root(dev, new, old, extack);
1138 
1139 		if (new && new->ops->attach && !ingress)
1140 			goto skip;
1141 
1142 		if (!ingress) {
1143 			for (i = 0; i < num_q; i++) {
1144 				dev_queue = netdev_get_tx_queue(dev, i);
1145 				old = dev_graft_qdisc(dev_queue, new);
1146 
1147 				if (new && i > 0)
1148 					qdisc_refcount_inc(new);
1149 				qdisc_put(old);
1150 			}
1151 		} else {
1152 			old = dev_graft_qdisc(dev_queue, NULL);
1153 
1154 			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
1155 			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
1156 			 * pointer(s) in mini_qdisc_pair_swap().
1157 			 */
1158 			qdisc_notify(net, skb, n, classid, old, new, extack);
1159 			qdisc_destroy(old);
1160 
1161 			dev_graft_qdisc(dev_queue, new);
1162 		}
1163 
1164 skip:
1165 		if (!ingress) {
1166 			old = rtnl_dereference(dev->qdisc);
1167 			if (new && !new->ops->attach)
1168 				qdisc_refcount_inc(new);
1169 			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1170 
1171 			notify_and_destroy(net, skb, n, classid, old, new, extack);
1172 
1173 			if (new && new->ops->attach)
1174 				new->ops->attach(new);
1175 		}
1176 
1177 		if (dev->flags & IFF_UP)
1178 			dev_activate(dev);
1179 	} else {
1180 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1181 		unsigned long cl;
1182 		int err;
1183 
1184 		/* Only support running class lockless if parent is lockless */
1185 		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1186 			qdisc_clear_nolock(new);
1187 
1188 		if (!cops || !cops->graft)
1189 			return -EOPNOTSUPP;
1190 
1191 		cl = cops->find(parent, classid);
1192 		if (!cl) {
1193 			NL_SET_ERR_MSG(extack, "Specified class not found");
1194 			return -ENOENT;
1195 		}
1196 
1197 		if (new && new->ops == &noqueue_qdisc_ops) {
1198 			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1199 			return -EINVAL;
1200 		}
1201 
1202 		if (new &&
1203 		    !(parent->flags & TCQ_F_MQROOT) &&
1204 		    rcu_access_pointer(new->stab)) {
1205 			NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
1206 			return -EINVAL;
1207 		}
1208 		err = cops->graft(parent, cl, new, &old, extack);
1209 		if (err)
1210 			return err;
1211 		notify_and_destroy(net, skb, n, classid, old, new, extack);
1212 	}
1213 	return 0;
1214 }
1215 
1216 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1217 				   struct netlink_ext_ack *extack)
1218 {
1219 	u32 block_index;
1220 
1221 	if (tca[TCA_INGRESS_BLOCK]) {
1222 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1223 
1224 		if (!block_index) {
1225 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1226 			return -EINVAL;
1227 		}
1228 		if (!sch->ops->ingress_block_set) {
1229 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1230 			return -EOPNOTSUPP;
1231 		}
1232 		sch->ops->ingress_block_set(sch, block_index);
1233 	}
1234 	if (tca[TCA_EGRESS_BLOCK]) {
1235 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1236 
1237 		if (!block_index) {
1238 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1239 			return -EINVAL;
1240 		}
1241 		if (!sch->ops->egress_block_set) {
1242 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1243 			return -EOPNOTSUPP;
1244 		}
1245 		sch->ops->egress_block_set(sch, block_index);
1246 	}
1247 	return 0;
1248 }
1249 
1250 /*
1251    Allocate and initialize new qdisc.
1252 
1253    Parameters are passed via opt.
1254  */
1255 
1256 static struct Qdisc *qdisc_create(struct net_device *dev,
1257 				  struct netdev_queue *dev_queue,
1258 				  u32 parent, u32 handle,
1259 				  struct nlattr **tca, int *errp,
1260 				  struct netlink_ext_ack *extack)
1261 {
1262 	int err;
1263 	struct nlattr *kind = tca[TCA_KIND];
1264 	struct Qdisc *sch;
1265 	struct Qdisc_ops *ops;
1266 	struct qdisc_size_table *stab;
1267 
1268 	ops = qdisc_lookup_ops(kind);
1269 #ifdef CONFIG_MODULES
1270 	if (ops == NULL && kind != NULL) {
1271 		char name[IFNAMSIZ];
1272 		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1273 			/* We dropped the RTNL semaphore in order to
1274 			 * perform the module load.  So, even if we
1275 			 * succeeded in loading the module we have to
1276 			 * tell the caller to replay the request.  We
1277 			 * indicate this using -EAGAIN.
1278 			 * We replay the request because the device may
1279 			 * go away in the mean time.
1280 			 */
1281 			rtnl_unlock();
1282 			request_module(NET_SCH_ALIAS_PREFIX "%s", name);
1283 			rtnl_lock();
1284 			ops = qdisc_lookup_ops(kind);
1285 			if (ops != NULL) {
1286 				/* We will try again qdisc_lookup_ops,
1287 				 * so don't keep a reference.
1288 				 */
1289 				module_put(ops->owner);
1290 				err = -EAGAIN;
1291 				goto err_out;
1292 			}
1293 		}
1294 	}
1295 #endif
1296 
1297 	err = -ENOENT;
1298 	if (!ops) {
1299 		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1300 		goto err_out;
1301 	}
1302 
1303 	sch = qdisc_alloc(dev_queue, ops, extack);
1304 	if (IS_ERR(sch)) {
1305 		err = PTR_ERR(sch);
1306 		goto err_out2;
1307 	}
1308 
1309 	sch->parent = parent;
1310 
1311 	if (handle == TC_H_INGRESS) {
1312 		if (!(sch->flags & TCQ_F_INGRESS)) {
1313 			NL_SET_ERR_MSG(extack,
1314 				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
1315 			err = -EINVAL;
1316 			goto err_out3;
1317 		}
1318 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1319 	} else {
1320 		if (handle == 0) {
1321 			handle = qdisc_alloc_handle(dev);
1322 			if (handle == 0) {
1323 				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1324 				err = -ENOSPC;
1325 				goto err_out3;
1326 			}
1327 		}
1328 		if (!netif_is_multiqueue(dev))
1329 			sch->flags |= TCQ_F_ONETXQUEUE;
1330 	}
1331 
1332 	sch->handle = handle;
1333 
1334 	/* This exist to keep backward compatible with a userspace
1335 	 * loophole, what allowed userspace to get IFF_NO_QUEUE
1336 	 * facility on older kernels by setting tx_queue_len=0 (prior
1337 	 * to qdisc init), and then forgot to reinit tx_queue_len
1338 	 * before again attaching a qdisc.
1339 	 */
1340 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1341 		WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN);
1342 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1343 	}
1344 
1345 	err = qdisc_block_indexes_set(sch, tca, extack);
1346 	if (err)
1347 		goto err_out3;
1348 
1349 	if (tca[TCA_STAB]) {
1350 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1351 		if (IS_ERR(stab)) {
1352 			err = PTR_ERR(stab);
1353 			goto err_out3;
1354 		}
1355 		rcu_assign_pointer(sch->stab, stab);
1356 	}
1357 
1358 	if (ops->init) {
1359 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1360 		if (err != 0)
1361 			goto err_out4;
1362 	}
1363 
1364 	if (tca[TCA_RATE]) {
1365 		err = -EOPNOTSUPP;
1366 		if (sch->flags & TCQ_F_MQROOT) {
1367 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1368 			goto err_out4;
1369 		}
1370 
1371 		err = gen_new_estimator(&sch->bstats,
1372 					sch->cpu_bstats,
1373 					&sch->rate_est,
1374 					NULL,
1375 					true,
1376 					tca[TCA_RATE]);
1377 		if (err) {
1378 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1379 			goto err_out4;
1380 		}
1381 	}
1382 
1383 	qdisc_hash_add(sch, false);
1384 	trace_qdisc_create(ops, dev, parent);
1385 
1386 	return sch;
1387 
1388 err_out4:
1389 	/* Even if ops->init() failed, we call ops->destroy()
1390 	 * like qdisc_create_dflt().
1391 	 */
1392 	if (ops->destroy)
1393 		ops->destroy(sch);
1394 	qdisc_put_stab(rtnl_dereference(sch->stab));
1395 err_out3:
1396 	lockdep_unregister_key(&sch->root_lock_key);
1397 	netdev_put(dev, &sch->dev_tracker);
1398 	qdisc_free(sch);
1399 err_out2:
1400 	module_put(ops->owner);
1401 err_out:
1402 	*errp = err;
1403 	return NULL;
1404 }
1405 
1406 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1407 			struct netlink_ext_ack *extack)
1408 {
1409 	struct qdisc_size_table *ostab, *stab = NULL;
1410 	int err = 0;
1411 
1412 	if (tca[TCA_OPTIONS]) {
1413 		if (!sch->ops->change) {
1414 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1415 			return -EINVAL;
1416 		}
1417 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1418 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1419 			return -EOPNOTSUPP;
1420 		}
1421 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1422 		if (err)
1423 			return err;
1424 	}
1425 
1426 	if (tca[TCA_STAB]) {
1427 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1428 		if (IS_ERR(stab))
1429 			return PTR_ERR(stab);
1430 	}
1431 
1432 	ostab = rtnl_dereference(sch->stab);
1433 	rcu_assign_pointer(sch->stab, stab);
1434 	qdisc_put_stab(ostab);
1435 
1436 	if (tca[TCA_RATE]) {
1437 		/* NB: ignores errors from replace_estimator
1438 		   because change can't be undone. */
1439 		if (sch->flags & TCQ_F_MQROOT)
1440 			goto out;
1441 		gen_replace_estimator(&sch->bstats,
1442 				      sch->cpu_bstats,
1443 				      &sch->rate_est,
1444 				      NULL,
1445 				      true,
1446 				      tca[TCA_RATE]);
1447 	}
1448 out:
1449 	return 0;
1450 }
1451 
1452 struct check_loop_arg {
1453 	struct qdisc_walker	w;
1454 	struct Qdisc		*p;
1455 	int			depth;
1456 };
1457 
1458 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1459 			 struct qdisc_walker *w);
1460 
1461 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1462 {
1463 	struct check_loop_arg	arg;
1464 
1465 	if (q->ops->cl_ops == NULL)
1466 		return 0;
1467 
1468 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1469 	arg.w.fn = check_loop_fn;
1470 	arg.depth = depth;
1471 	arg.p = p;
1472 	q->ops->cl_ops->walk(q, &arg.w);
1473 	return arg.w.stop ? -ELOOP : 0;
1474 }
1475 
1476 static int
1477 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1478 {
1479 	struct Qdisc *leaf;
1480 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1481 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1482 
1483 	leaf = cops->leaf(q, cl);
1484 	if (leaf) {
1485 		if (leaf == arg->p || arg->depth > 7)
1486 			return -ELOOP;
1487 		return check_loop(leaf, arg->p, arg->depth + 1);
1488 	}
1489 	return 0;
1490 }
1491 
1492 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1493 	[TCA_KIND]		= { .type = NLA_STRING },
1494 	[TCA_RATE]		= { .type = NLA_BINARY,
1495 				    .len = sizeof(struct tc_estimator) },
1496 	[TCA_STAB]		= { .type = NLA_NESTED },
1497 	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
1498 	[TCA_CHAIN]		= { .type = NLA_U32 },
1499 	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
1500 	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
1501 };
1502 
1503 /*
1504  * Delete/get qdisc.
1505  */
1506 
1507 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1508 			struct netlink_ext_ack *extack)
1509 {
1510 	struct net *net = sock_net(skb->sk);
1511 	struct tcmsg *tcm = nlmsg_data(n);
1512 	struct nlattr *tca[TCA_MAX + 1];
1513 	struct net_device *dev;
1514 	u32 clid;
1515 	struct Qdisc *q = NULL;
1516 	struct Qdisc *p = NULL;
1517 	int err;
1518 
1519 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1520 				     rtm_tca_policy, extack);
1521 	if (err < 0)
1522 		return err;
1523 
1524 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1525 	if (!dev)
1526 		return -ENODEV;
1527 
1528 	clid = tcm->tcm_parent;
1529 	if (clid) {
1530 		if (clid != TC_H_ROOT) {
1531 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1532 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1533 				if (!p) {
1534 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1535 					return -ENOENT;
1536 				}
1537 				q = qdisc_leaf(p, clid);
1538 			} else if (dev_ingress_queue(dev)) {
1539 				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1540 			}
1541 		} else {
1542 			q = rtnl_dereference(dev->qdisc);
1543 		}
1544 		if (!q) {
1545 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1546 			return -ENOENT;
1547 		}
1548 
1549 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1550 			NL_SET_ERR_MSG(extack, "Invalid handle");
1551 			return -EINVAL;
1552 		}
1553 	} else {
1554 		q = qdisc_lookup(dev, tcm->tcm_handle);
1555 		if (!q) {
1556 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1557 			return -ENOENT;
1558 		}
1559 	}
1560 
1561 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1562 		NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
1563 		return -EINVAL;
1564 	}
1565 
1566 	if (n->nlmsg_type == RTM_DELQDISC) {
1567 		if (!clid) {
1568 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1569 			return -EINVAL;
1570 		}
1571 		if (q->handle == 0) {
1572 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1573 			return -ENOENT;
1574 		}
1575 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1576 		if (err != 0)
1577 			return err;
1578 	} else {
1579 		qdisc_get_notify(net, skb, n, clid, q, NULL);
1580 	}
1581 	return 0;
1582 }
1583 
1584 static bool req_create_or_replace(struct nlmsghdr *n)
1585 {
1586 	return (n->nlmsg_flags & NLM_F_CREATE &&
1587 		n->nlmsg_flags & NLM_F_REPLACE);
1588 }
1589 
1590 static bool req_create_exclusive(struct nlmsghdr *n)
1591 {
1592 	return (n->nlmsg_flags & NLM_F_CREATE &&
1593 		n->nlmsg_flags & NLM_F_EXCL);
1594 }
1595 
1596 static bool req_change(struct nlmsghdr *n)
1597 {
1598 	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1599 		!(n->nlmsg_flags & NLM_F_REPLACE) &&
1600 		!(n->nlmsg_flags & NLM_F_EXCL));
1601 }
1602 
1603 /*
1604  * Create/change qdisc.
1605  */
1606 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1607 			   struct netlink_ext_ack *extack)
1608 {
1609 	struct net *net = sock_net(skb->sk);
1610 	struct tcmsg *tcm;
1611 	struct nlattr *tca[TCA_MAX + 1];
1612 	struct net_device *dev;
1613 	u32 clid;
1614 	struct Qdisc *q, *p;
1615 	int err;
1616 
1617 replay:
1618 	/* Reinit, just in case something touches this. */
1619 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1620 				     rtm_tca_policy, extack);
1621 	if (err < 0)
1622 		return err;
1623 
1624 	tcm = nlmsg_data(n);
1625 	clid = tcm->tcm_parent;
1626 	q = p = NULL;
1627 
1628 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1629 	if (!dev)
1630 		return -ENODEV;
1631 
1632 
1633 	if (clid) {
1634 		if (clid != TC_H_ROOT) {
1635 			if (clid != TC_H_INGRESS) {
1636 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1637 				if (!p) {
1638 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1639 					return -ENOENT;
1640 				}
1641 				q = qdisc_leaf(p, clid);
1642 			} else if (dev_ingress_queue_create(dev)) {
1643 				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1644 			}
1645 		} else {
1646 			q = rtnl_dereference(dev->qdisc);
1647 		}
1648 
1649 		/* It may be default qdisc, ignore it */
1650 		if (q && q->handle == 0)
1651 			q = NULL;
1652 
1653 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1654 			if (tcm->tcm_handle) {
1655 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1656 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1657 					return -EEXIST;
1658 				}
1659 				if (TC_H_MIN(tcm->tcm_handle)) {
1660 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1661 					return -EINVAL;
1662 				}
1663 				q = qdisc_lookup(dev, tcm->tcm_handle);
1664 				if (!q)
1665 					goto create_n_graft;
1666 				if (q->parent != tcm->tcm_parent) {
1667 					NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent");
1668 					return -EINVAL;
1669 				}
1670 				if (n->nlmsg_flags & NLM_F_EXCL) {
1671 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1672 					return -EEXIST;
1673 				}
1674 				if (tca[TCA_KIND] &&
1675 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1676 					NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
1677 					return -EINVAL;
1678 				}
1679 				if (q->flags & TCQ_F_INGRESS) {
1680 					NL_SET_ERR_MSG(extack,
1681 						       "Cannot regraft ingress or clsact Qdiscs");
1682 					return -EINVAL;
1683 				}
1684 				if (q == p ||
1685 				    (p && check_loop(q, p, 0))) {
1686 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1687 					return -ELOOP;
1688 				}
1689 				if (clid == TC_H_INGRESS) {
1690 					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1691 					return -EINVAL;
1692 				}
1693 				qdisc_refcount_inc(q);
1694 				goto graft;
1695 			} else {
1696 				if (!q)
1697 					goto create_n_graft;
1698 
1699 				/* This magic test requires explanation.
1700 				 *
1701 				 *   We know, that some child q is already
1702 				 *   attached to this parent and have choice:
1703 				 *   1) change it or 2) create/graft new one.
1704 				 *   If the requested qdisc kind is different
1705 				 *   than the existing one, then we choose graft.
1706 				 *   If they are the same then this is "change"
1707 				 *   operation - just let it fallthrough..
1708 				 *
1709 				 *   1. We are allowed to create/graft only
1710 				 *   if the request is explicitly stating
1711 				 *   "please create if it doesn't exist".
1712 				 *
1713 				 *   2. If the request is to exclusive create
1714 				 *   then the qdisc tcm_handle is not expected
1715 				 *   to exist, so that we choose create/graft too.
1716 				 *
1717 				 *   3. The last case is when no flags are set.
1718 				 *   This will happen when for example tc
1719 				 *   utility issues a "change" command.
1720 				 *   Alas, it is sort of hole in API, we
1721 				 *   cannot decide what to do unambiguously.
1722 				 *   For now we select create/graft.
1723 				 */
1724 				if (tca[TCA_KIND] &&
1725 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1726 					if (req_create_or_replace(n) ||
1727 					    req_create_exclusive(n))
1728 						goto create_n_graft;
1729 					else if (req_change(n))
1730 						goto create_n_graft2;
1731 				}
1732 			}
1733 		}
1734 	} else {
1735 		if (!tcm->tcm_handle) {
1736 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1737 			return -EINVAL;
1738 		}
1739 		q = qdisc_lookup(dev, tcm->tcm_handle);
1740 	}
1741 
1742 	/* Change qdisc parameters */
1743 	if (!q) {
1744 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1745 		return -ENOENT;
1746 	}
1747 	if (n->nlmsg_flags & NLM_F_EXCL) {
1748 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1749 		return -EEXIST;
1750 	}
1751 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1752 		NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
1753 		return -EINVAL;
1754 	}
1755 	err = qdisc_change(q, tca, extack);
1756 	if (err == 0)
1757 		qdisc_notify(net, skb, n, clid, NULL, q, extack);
1758 	return err;
1759 
1760 create_n_graft:
1761 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1762 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1763 		return -ENOENT;
1764 	}
1765 create_n_graft2:
1766 	if (clid == TC_H_INGRESS) {
1767 		if (dev_ingress_queue(dev)) {
1768 			q = qdisc_create(dev, dev_ingress_queue(dev),
1769 					 tcm->tcm_parent, tcm->tcm_parent,
1770 					 tca, &err, extack);
1771 		} else {
1772 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1773 			err = -ENOENT;
1774 		}
1775 	} else {
1776 		struct netdev_queue *dev_queue;
1777 
1778 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1779 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1780 		else if (p)
1781 			dev_queue = p->dev_queue;
1782 		else
1783 			dev_queue = netdev_get_tx_queue(dev, 0);
1784 
1785 		q = qdisc_create(dev, dev_queue,
1786 				 tcm->tcm_parent, tcm->tcm_handle,
1787 				 tca, &err, extack);
1788 	}
1789 	if (q == NULL) {
1790 		if (err == -EAGAIN)
1791 			goto replay;
1792 		return err;
1793 	}
1794 
1795 graft:
1796 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1797 	if (err) {
1798 		if (q)
1799 			qdisc_put(q);
1800 		return err;
1801 	}
1802 
1803 	return 0;
1804 }
1805 
1806 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1807 			      struct netlink_callback *cb,
1808 			      int *q_idx_p, int s_q_idx, bool recur,
1809 			      bool dump_invisible)
1810 {
1811 	int ret = 0, q_idx = *q_idx_p;
1812 	struct Qdisc *q;
1813 	int b;
1814 
1815 	if (!root)
1816 		return 0;
1817 
1818 	q = root;
1819 	if (q_idx < s_q_idx) {
1820 		q_idx++;
1821 	} else {
1822 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1823 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1824 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1825 				  RTM_NEWQDISC, NULL) <= 0)
1826 			goto done;
1827 		q_idx++;
1828 	}
1829 
1830 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1831 	 * itself has already been dumped.
1832 	 *
1833 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1834 	 * qdisc hashtable, we don't want to hit it again
1835 	 */
1836 	if (!qdisc_dev(root) || !recur)
1837 		goto out;
1838 
1839 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1840 		if (q_idx < s_q_idx) {
1841 			q_idx++;
1842 			continue;
1843 		}
1844 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1845 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1846 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1847 				  RTM_NEWQDISC, NULL) <= 0)
1848 			goto done;
1849 		q_idx++;
1850 	}
1851 
1852 out:
1853 	*q_idx_p = q_idx;
1854 	return ret;
1855 done:
1856 	ret = -1;
1857 	goto out;
1858 }
1859 
1860 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1861 {
1862 	struct net *net = sock_net(skb->sk);
1863 	int idx, q_idx;
1864 	int s_idx, s_q_idx;
1865 	struct net_device *dev;
1866 	const struct nlmsghdr *nlh = cb->nlh;
1867 	struct nlattr *tca[TCA_MAX + 1];
1868 	int err;
1869 
1870 	s_idx = cb->args[0];
1871 	s_q_idx = q_idx = cb->args[1];
1872 
1873 	idx = 0;
1874 	ASSERT_RTNL();
1875 
1876 	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1877 				     rtm_tca_policy, cb->extack);
1878 	if (err < 0)
1879 		return err;
1880 
1881 	for_each_netdev(net, dev) {
1882 		struct netdev_queue *dev_queue;
1883 
1884 		if (idx < s_idx)
1885 			goto cont;
1886 		if (idx > s_idx)
1887 			s_q_idx = 0;
1888 		q_idx = 0;
1889 
1890 		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1891 				       skb, cb, &q_idx, s_q_idx,
1892 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1893 			goto done;
1894 
1895 		dev_queue = dev_ingress_queue(dev);
1896 		if (dev_queue &&
1897 		    tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1898 				       skb, cb, &q_idx, s_q_idx, false,
1899 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1900 			goto done;
1901 
1902 cont:
1903 		idx++;
1904 	}
1905 
1906 done:
1907 	cb->args[0] = idx;
1908 	cb->args[1] = q_idx;
1909 
1910 	return skb->len;
1911 }
1912 
1913 
1914 
1915 /************************************************
1916  *	Traffic classes manipulation.		*
1917  ************************************************/
1918 
1919 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1920 			  unsigned long cl, u32 portid, u32 seq, u16 flags,
1921 			  int event, struct netlink_ext_ack *extack)
1922 {
1923 	struct tcmsg *tcm;
1924 	struct nlmsghdr  *nlh;
1925 	unsigned char *b = skb_tail_pointer(skb);
1926 	struct gnet_dump d;
1927 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1928 
1929 	cond_resched();
1930 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1931 	if (!nlh)
1932 		goto out_nlmsg_trim;
1933 	tcm = nlmsg_data(nlh);
1934 	tcm->tcm_family = AF_UNSPEC;
1935 	tcm->tcm__pad1 = 0;
1936 	tcm->tcm__pad2 = 0;
1937 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1938 	tcm->tcm_parent = q->handle;
1939 	tcm->tcm_handle = q->handle;
1940 	tcm->tcm_info = 0;
1941 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1942 		goto nla_put_failure;
1943 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1944 		goto nla_put_failure;
1945 
1946 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1947 					 NULL, &d, TCA_PAD) < 0)
1948 		goto nla_put_failure;
1949 
1950 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1951 		goto nla_put_failure;
1952 
1953 	if (gnet_stats_finish_copy(&d) < 0)
1954 		goto nla_put_failure;
1955 
1956 	if (extack && extack->_msg &&
1957 	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1958 		goto out_nlmsg_trim;
1959 
1960 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1961 
1962 	return skb->len;
1963 
1964 out_nlmsg_trim:
1965 nla_put_failure:
1966 	nlmsg_trim(skb, b);
1967 	return -1;
1968 }
1969 
1970 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1971 			 struct nlmsghdr *n, struct Qdisc *q,
1972 			 unsigned long cl, int event, struct netlink_ext_ack *extack)
1973 {
1974 	struct sk_buff *skb;
1975 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1976 
1977 	if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1978 		return 0;
1979 
1980 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1981 	if (!skb)
1982 		return -ENOBUFS;
1983 
1984 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1985 		kfree_skb(skb);
1986 		return -EINVAL;
1987 	}
1988 
1989 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1990 			      n->nlmsg_flags & NLM_F_ECHO);
1991 }
1992 
1993 static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
1994 			     struct nlmsghdr *n, struct Qdisc *q,
1995 			     unsigned long cl, struct netlink_ext_ack *extack)
1996 {
1997 	struct sk_buff *skb;
1998 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1999 
2000 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2001 	if (!skb)
2002 		return -ENOBUFS;
2003 
2004 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
2005 			   extack) < 0) {
2006 		kfree_skb(skb);
2007 		return -EINVAL;
2008 	}
2009 
2010 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
2011 			      n->nlmsg_flags & NLM_F_ECHO);
2012 }
2013 
2014 static int tclass_del_notify(struct net *net,
2015 			     const struct Qdisc_class_ops *cops,
2016 			     struct sk_buff *oskb, struct nlmsghdr *n,
2017 			     struct Qdisc *q, unsigned long cl,
2018 			     struct netlink_ext_ack *extack)
2019 {
2020 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
2021 	struct sk_buff *skb;
2022 	int err = 0;
2023 
2024 	if (!cops->delete)
2025 		return -EOPNOTSUPP;
2026 
2027 	if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
2028 		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2029 		if (!skb)
2030 			return -ENOBUFS;
2031 
2032 		if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
2033 				   RTM_DELTCLASS, extack) < 0) {
2034 			kfree_skb(skb);
2035 			return -EINVAL;
2036 		}
2037 	} else {
2038 		skb = NULL;
2039 	}
2040 
2041 	err = cops->delete(q, cl, extack);
2042 	if (err) {
2043 		kfree_skb(skb);
2044 		return err;
2045 	}
2046 
2047 	err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
2048 				   n->nlmsg_flags & NLM_F_ECHO);
2049 	return err;
2050 }
2051 
2052 #ifdef CONFIG_NET_CLS
2053 
2054 struct tcf_bind_args {
2055 	struct tcf_walker w;
2056 	unsigned long base;
2057 	unsigned long cl;
2058 	u32 classid;
2059 };
2060 
2061 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
2062 {
2063 	struct tcf_bind_args *a = (void *)arg;
2064 
2065 	if (n && tp->ops->bind_class) {
2066 		struct Qdisc *q = tcf_block_q(tp->chain->block);
2067 
2068 		sch_tree_lock(q);
2069 		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2070 		sch_tree_unlock(q);
2071 	}
2072 	return 0;
2073 }
2074 
2075 struct tc_bind_class_args {
2076 	struct qdisc_walker w;
2077 	unsigned long new_cl;
2078 	u32 portid;
2079 	u32 clid;
2080 };
2081 
2082 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2083 				struct qdisc_walker *w)
2084 {
2085 	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2086 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2087 	struct tcf_block *block;
2088 	struct tcf_chain *chain;
2089 
2090 	block = cops->tcf_block(q, cl, NULL);
2091 	if (!block)
2092 		return 0;
2093 	for (chain = tcf_get_next_chain(block, NULL);
2094 	     chain;
2095 	     chain = tcf_get_next_chain(block, chain)) {
2096 		struct tcf_proto *tp;
2097 
2098 		for (tp = tcf_get_next_proto(chain, NULL);
2099 		     tp; tp = tcf_get_next_proto(chain, tp)) {
2100 			struct tcf_bind_args arg = {};
2101 
2102 			arg.w.fn = tcf_node_bind;
2103 			arg.classid = a->clid;
2104 			arg.base = cl;
2105 			arg.cl = a->new_cl;
2106 			tp->ops->walk(tp, &arg.w, true);
2107 		}
2108 	}
2109 
2110 	return 0;
2111 }
2112 
2113 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2114 			   unsigned long new_cl)
2115 {
2116 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2117 	struct tc_bind_class_args args = {};
2118 
2119 	if (!cops->tcf_block)
2120 		return;
2121 	args.portid = portid;
2122 	args.clid = clid;
2123 	args.new_cl = new_cl;
2124 	args.w.fn = tc_bind_class_walker;
2125 	q->ops->cl_ops->walk(q, &args.w);
2126 }
2127 
2128 #else
2129 
2130 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2131 			   unsigned long new_cl)
2132 {
2133 }
2134 
2135 #endif
2136 
2137 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2138 			 struct netlink_ext_ack *extack)
2139 {
2140 	struct net *net = sock_net(skb->sk);
2141 	struct tcmsg *tcm = nlmsg_data(n);
2142 	struct nlattr *tca[TCA_MAX + 1];
2143 	struct net_device *dev;
2144 	struct Qdisc *q = NULL;
2145 	const struct Qdisc_class_ops *cops;
2146 	unsigned long cl = 0;
2147 	unsigned long new_cl;
2148 	u32 portid;
2149 	u32 clid;
2150 	u32 qid;
2151 	int err;
2152 
2153 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2154 				     rtm_tca_policy, extack);
2155 	if (err < 0)
2156 		return err;
2157 
2158 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2159 	if (!dev)
2160 		return -ENODEV;
2161 
2162 	/*
2163 	   parent == TC_H_UNSPEC - unspecified parent.
2164 	   parent == TC_H_ROOT   - class is root, which has no parent.
2165 	   parent == X:0	 - parent is root class.
2166 	   parent == X:Y	 - parent is a node in hierarchy.
2167 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
2168 
2169 	   handle == 0:0	 - generate handle from kernel pool.
2170 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
2171 	   handle == X:Y	 - clear.
2172 	   handle == X:0	 - root class.
2173 	 */
2174 
2175 	/* Step 1. Determine qdisc handle X:0 */
2176 
2177 	portid = tcm->tcm_parent;
2178 	clid = tcm->tcm_handle;
2179 	qid = TC_H_MAJ(clid);
2180 
2181 	if (portid != TC_H_ROOT) {
2182 		u32 qid1 = TC_H_MAJ(portid);
2183 
2184 		if (qid && qid1) {
2185 			/* If both majors are known, they must be identical. */
2186 			if (qid != qid1)
2187 				return -EINVAL;
2188 		} else if (qid1) {
2189 			qid = qid1;
2190 		} else if (qid == 0)
2191 			qid = rtnl_dereference(dev->qdisc)->handle;
2192 
2193 		/* Now qid is genuine qdisc handle consistent
2194 		 * both with parent and child.
2195 		 *
2196 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2197 		 */
2198 		if (portid)
2199 			portid = TC_H_MAKE(qid, portid);
2200 	} else {
2201 		if (qid == 0)
2202 			qid = rtnl_dereference(dev->qdisc)->handle;
2203 	}
2204 
2205 	/* OK. Locate qdisc */
2206 	q = qdisc_lookup(dev, qid);
2207 	if (!q)
2208 		return -ENOENT;
2209 
2210 	/* An check that it supports classes */
2211 	cops = q->ops->cl_ops;
2212 	if (cops == NULL)
2213 		return -EINVAL;
2214 
2215 	/* Now try to get class */
2216 	if (clid == 0) {
2217 		if (portid == TC_H_ROOT)
2218 			clid = qid;
2219 	} else
2220 		clid = TC_H_MAKE(qid, clid);
2221 
2222 	if (clid)
2223 		cl = cops->find(q, clid);
2224 
2225 	if (cl == 0) {
2226 		err = -ENOENT;
2227 		if (n->nlmsg_type != RTM_NEWTCLASS ||
2228 		    !(n->nlmsg_flags & NLM_F_CREATE))
2229 			goto out;
2230 	} else {
2231 		switch (n->nlmsg_type) {
2232 		case RTM_NEWTCLASS:
2233 			err = -EEXIST;
2234 			if (n->nlmsg_flags & NLM_F_EXCL)
2235 				goto out;
2236 			break;
2237 		case RTM_DELTCLASS:
2238 			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2239 			/* Unbind the class with flilters with 0 */
2240 			tc_bind_tclass(q, portid, clid, 0);
2241 			goto out;
2242 		case RTM_GETTCLASS:
2243 			err = tclass_get_notify(net, skb, n, q, cl, extack);
2244 			goto out;
2245 		default:
2246 			err = -EINVAL;
2247 			goto out;
2248 		}
2249 	}
2250 
2251 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2252 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2253 		return -EOPNOTSUPP;
2254 	}
2255 
2256 	/* Prevent creation of traffic classes with classid TC_H_ROOT */
2257 	if (clid == TC_H_ROOT) {
2258 		NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT");
2259 		return -EINVAL;
2260 	}
2261 
2262 	new_cl = cl;
2263 	err = -EOPNOTSUPP;
2264 	if (cops->change)
2265 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
2266 	if (err == 0) {
2267 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2268 		/* We just create a new class, need to do reverse binding. */
2269 		if (cl != new_cl)
2270 			tc_bind_tclass(q, portid, clid, new_cl);
2271 	}
2272 out:
2273 	return err;
2274 }
2275 
2276 struct qdisc_dump_args {
2277 	struct qdisc_walker	w;
2278 	struct sk_buff		*skb;
2279 	struct netlink_callback	*cb;
2280 };
2281 
2282 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2283 			    struct qdisc_walker *arg)
2284 {
2285 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2286 
2287 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2288 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2289 			      RTM_NEWTCLASS, NULL);
2290 }
2291 
2292 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2293 				struct tcmsg *tcm, struct netlink_callback *cb,
2294 				int *t_p, int s_t)
2295 {
2296 	struct qdisc_dump_args arg;
2297 
2298 	if (tc_qdisc_dump_ignore(q, false) ||
2299 	    *t_p < s_t || !q->ops->cl_ops ||
2300 	    (tcm->tcm_parent &&
2301 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2302 		(*t_p)++;
2303 		return 0;
2304 	}
2305 	if (*t_p > s_t)
2306 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2307 	arg.w.fn = qdisc_class_dump;
2308 	arg.skb = skb;
2309 	arg.cb = cb;
2310 	arg.w.stop  = 0;
2311 	arg.w.skip = cb->args[1];
2312 	arg.w.count = 0;
2313 	q->ops->cl_ops->walk(q, &arg.w);
2314 	cb->args[1] = arg.w.count;
2315 	if (arg.w.stop)
2316 		return -1;
2317 	(*t_p)++;
2318 	return 0;
2319 }
2320 
2321 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2322 			       struct tcmsg *tcm, struct netlink_callback *cb,
2323 			       int *t_p, int s_t, bool recur)
2324 {
2325 	struct Qdisc *q;
2326 	int b;
2327 
2328 	if (!root)
2329 		return 0;
2330 
2331 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2332 		return -1;
2333 
2334 	if (!qdisc_dev(root) || !recur)
2335 		return 0;
2336 
2337 	if (tcm->tcm_parent) {
2338 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2339 		if (q && q != root &&
2340 		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2341 			return -1;
2342 		return 0;
2343 	}
2344 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2345 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2346 			return -1;
2347 	}
2348 
2349 	return 0;
2350 }
2351 
2352 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2353 {
2354 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2355 	struct net *net = sock_net(skb->sk);
2356 	struct netdev_queue *dev_queue;
2357 	struct net_device *dev;
2358 	int t, s_t;
2359 
2360 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2361 		return 0;
2362 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2363 	if (!dev)
2364 		return 0;
2365 
2366 	s_t = cb->args[0];
2367 	t = 0;
2368 
2369 	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2370 				skb, tcm, cb, &t, s_t, true) < 0)
2371 		goto done;
2372 
2373 	dev_queue = dev_ingress_queue(dev);
2374 	if (dev_queue &&
2375 	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2376 				skb, tcm, cb, &t, s_t, false) < 0)
2377 		goto done;
2378 
2379 done:
2380 	cb->args[0] = t;
2381 
2382 	dev_put(dev);
2383 	return skb->len;
2384 }
2385 
2386 #ifdef CONFIG_PROC_FS
2387 static int psched_show(struct seq_file *seq, void *v)
2388 {
2389 	seq_printf(seq, "%08x %08x %08x %08x\n",
2390 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2391 		   1000000,
2392 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2393 
2394 	return 0;
2395 }
2396 
2397 static int __net_init psched_net_init(struct net *net)
2398 {
2399 	struct proc_dir_entry *e;
2400 
2401 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2402 	if (e == NULL)
2403 		return -ENOMEM;
2404 
2405 	return 0;
2406 }
2407 
2408 static void __net_exit psched_net_exit(struct net *net)
2409 {
2410 	remove_proc_entry("psched", net->proc_net);
2411 }
2412 #else
2413 static int __net_init psched_net_init(struct net *net)
2414 {
2415 	return 0;
2416 }
2417 
2418 static void __net_exit psched_net_exit(struct net *net)
2419 {
2420 }
2421 #endif
2422 
2423 static struct pernet_operations psched_net_ops = {
2424 	.init = psched_net_init,
2425 	.exit = psched_net_exit,
2426 };
2427 
2428 #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
2429 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2430 #endif
2431 
2432 static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = {
2433 	{.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc},
2434 	{.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc},
2435 	{.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc,
2436 	 .dumpit = tc_dump_qdisc},
2437 	{.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass},
2438 	{.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass},
2439 	{.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass,
2440 	 .dumpit = tc_dump_tclass},
2441 };
2442 
2443 static int __init pktsched_init(void)
2444 {
2445 	int err;
2446 
2447 	err = register_pernet_subsys(&psched_net_ops);
2448 	if (err) {
2449 		pr_err("pktsched_init: "
2450 		       "cannot initialize per netns operations\n");
2451 		return err;
2452 	}
2453 
2454 	register_qdisc(&pfifo_fast_ops);
2455 	register_qdisc(&pfifo_qdisc_ops);
2456 	register_qdisc(&bfifo_qdisc_ops);
2457 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2458 	register_qdisc(&mq_qdisc_ops);
2459 	register_qdisc(&noqueue_qdisc_ops);
2460 
2461 	rtnl_register_many(psched_rtnl_msg_handlers);
2462 
2463 	tc_wrapper_init();
2464 
2465 	return 0;
2466 }
2467 
2468 subsys_initcall(pktsched_init);
2469