xref: /linux/net/sched/sch_api.c (revision dd2934a95701576203b2f61e8ded4e4a2f9183ea)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/slab.h>
31 #include <linux/hashtable.h>
32 
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 #include <net/pkt_cls.h>
38 
39 /*
40 
41    Short review.
42    -------------
43 
44    This file consists of two interrelated parts:
45 
46    1. The queueing discipline manager frontend.
47    2. The traffic class manager frontend.
48 
49    Generally, a queueing discipline ("qdisc") is a black box
50    that can enqueue packets and dequeue them (when the device
51    is ready to send something) in an order and at times
52    determined by the algorithm hidden inside it.
53 
54    qdiscs are divided into two categories:
55    - "queues", which have no internal structure visible from outside.
56    - "schedulers", which split all the packets into "traffic classes",
57      using "packet classifiers" (see cls_api.c).
58 
59    In turn, classes may have child qdiscs (as a rule, queues)
60    attached to them, and so on.
61 
62    The goal of the routines in this file is to translate the
63    information supplied by the user in the form of handles into
64    a form more intelligible to the kernel, to perform some sanity
65    checks and the part of the work that is common to all qdiscs,
66    and to provide rtnetlink notifications.
67 
68    All real intelligent work is done inside qdisc modules.
69 
70 
71 
72    Every discipline has two major routines: enqueue and dequeue.
73 
74    ---dequeue
75 
76    dequeue usually returns an skb to send. It is allowed to return NULL,
77    but that does not mean the queue is empty; it just means the
78    discipline does not want to send anything at this time.
79    The queue is really empty only if q->q.qlen == 0.
80    For complicated disciplines with multiple queues, q->q is not the
81    real packet queue, but q->q.qlen must nevertheless be valid.
82 
83    ---enqueue
84 
85    enqueue returns 0 if the packet was enqueued successfully.
86    If a packet (this one or another one) was dropped, it returns
87    a non-zero error code.
88    NET_XMIT_DROP 	- this packet was dropped.
89      Expected action: do not back off, but wait until the queue clears.
90    NET_XMIT_CN	 	- probably this packet was enqueued, but another one was dropped.
91      Expected action: back off or ignore.
92 
93    Auxiliary routines:
94 
95    ---peek
96 
97    like dequeue, but without removing a packet from the queue.
98 
99    ---reset
100 
101    returns the qdisc to its initial state: purges all buffers, clears all
102    timers and counters (except statistics), etc.
103 
104    ---init
105 
106    initializes a newly created qdisc.
107 
108    ---destroy
109 
110    destroys the resources allocated by init and during the lifetime of the qdisc.
111 
112    ---change
113 
114    changes qdisc parameters.
115  */
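/* Example (editorial sketch, not part of the original sources): a minimal
 * FIFO-style discipline honouring the enqueue/dequeue contract described
 * above.  The example_fifo_* names are hypothetical; register_qdisc() below
 * fills in safe defaults for hooks left NULL, but requires a peek whenever
 * a dequeue is supplied.
 *
 *	static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *					struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < qdisc_dev(sch)->tx_queue_len))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_drop(skb, sch, to_free);	// NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);	// NULL: nothing to send now
 *	}
 *
 *	static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
 *		.id		= "example_fifo",
 *		.enqueue	= example_fifo_enqueue,
 *		.dequeue	= example_fifo_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 */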
116 
117 /* Protects the list of registered TC modules. It is a pure SMP lock. */
118 static DEFINE_RWLOCK(qdisc_mod_lock);
119 
120 
121 /************************************************
122  *	Queueing disciplines manipulation.	*
123  ************************************************/
124 
125 
126 /* The list of all installed queueing disciplines. */
127 
128 static struct Qdisc_ops *qdisc_base;
129 
130 /* Register/unregister queueing discipline */
131 
132 int register_qdisc(struct Qdisc_ops *qops)
133 {
134 	struct Qdisc_ops *q, **qp;
135 	int rc = -EEXIST;
136 
137 	write_lock(&qdisc_mod_lock);
138 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
139 		if (!strcmp(qops->id, q->id))
140 			goto out;
141 
142 	if (qops->enqueue == NULL)
143 		qops->enqueue = noop_qdisc_ops.enqueue;
144 	if (qops->peek == NULL) {
145 		if (qops->dequeue == NULL)
146 			qops->peek = noop_qdisc_ops.peek;
147 		else
148 			goto out_einval;
149 	}
150 	if (qops->dequeue == NULL)
151 		qops->dequeue = noop_qdisc_ops.dequeue;
152 
153 	if (qops->cl_ops) {
154 		const struct Qdisc_class_ops *cops = qops->cl_ops;
155 
156 		if (!(cops->find && cops->walk && cops->leaf))
157 			goto out_einval;
158 
159 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
160 			goto out_einval;
161 	}
162 
163 	qops->next = NULL;
164 	*qp = qops;
165 	rc = 0;
166 out:
167 	write_unlock(&qdisc_mod_lock);
168 	return rc;
169 
170 out_einval:
171 	rc = -EINVAL;
172 	goto out;
173 }
174 EXPORT_SYMBOL(register_qdisc);
175 
176 int unregister_qdisc(struct Qdisc_ops *qops)
177 {
178 	struct Qdisc_ops *q, **qp;
179 	int err = -ENOENT;
180 
181 	write_lock(&qdisc_mod_lock);
182 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
183 		if (q == qops)
184 			break;
185 	if (q) {
186 		*qp = q->next;
187 		q->next = NULL;
188 		err = 0;
189 	}
190 	write_unlock(&qdisc_mod_lock);
191 	return err;
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
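/* Example (editorial sketch): the usual pairing of the two calls above in a
 * scheduler module; example_fifo_qdisc_ops is the hypothetical ops table
 * sketched earlier.
 *
 *	static int __init example_fifo_module_init(void)
 *	{
 *		return register_qdisc(&example_fifo_qdisc_ops);
 *	}
 *
 *	static void __exit example_fifo_module_exit(void)
 *	{
 *		unregister_qdisc(&example_fifo_qdisc_ops);
 *	}
 *
 *	module_init(example_fifo_module_init);
 *	module_exit(example_fifo_module_exit);
 *	MODULE_LICENSE("GPL");
 */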
194 
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198 	read_lock(&qdisc_mod_lock);
199 	strlcpy(name, default_qdisc_ops->id, len);
200 	read_unlock(&qdisc_mod_lock);
201 }
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205 	struct Qdisc_ops *q = NULL;
206 
207 	for (q = qdisc_base; q; q = q->next) {
208 		if (!strcmp(name, q->id)) {
209 			if (!try_module_get(q->owner))
210 				q = NULL;
211 			break;
212 		}
213 	}
214 
215 	return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221 	const struct Qdisc_ops *ops;
222 
223 	if (!capable(CAP_NET_ADMIN))
224 		return -EPERM;
225 
226 	write_lock(&qdisc_mod_lock);
227 	ops = qdisc_lookup_default(name);
228 	if (!ops) {
229 		/* Not found, drop lock and try to load module */
230 		write_unlock(&qdisc_mod_lock);
231 		request_module("sch_%s", name);
232 		write_lock(&qdisc_mod_lock);
233 
234 		ops = qdisc_lookup_default(name);
235 	}
236 
237 	if (ops) {
238 		/* Set new default */
239 		module_put(default_qdisc_ops->owner);
240 		default_qdisc_ops = ops;
241 	}
242 	write_unlock(&qdisc_mod_lock);
243 
244 	return ops ? 0 : -ENOENT;
245 }
246 
247 #ifdef CONFIG_NET_SCH_DEFAULT
248 /* Set default value from kernel config */
249 static int __init sch_default_qdisc(void)
250 {
251 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 }
253 late_initcall(sch_default_qdisc);
254 #endif
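/* Example (editorial note): at runtime qdisc_set_default() is normally
 * reached through the net.core.default_qdisc sysctl, e.g.
 * "sysctl -w net.core.default_qdisc=fq", which also demand-loads sch_fq
 * via the request_module() call above.
 */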
255 
256 /* We know the handle. Find the qdisc among all qdiscs attached to the device
257  * (the root qdisc, all its children, children of children, etc.).
258  * Note: the caller either holds rtnl or rcu_read_lock().
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263 	struct Qdisc *q;
264 
265 	if (!qdisc_dev(root))
266 		return (root->handle == handle ? root : NULL);
267 
268 	if (!(root->flags & TCQ_F_BUILTIN) &&
269 	    root->handle == handle)
270 		return root;
271 
272 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
273 		if (q->handle == handle)
274 			return q;
275 	}
276 	return NULL;
277 }
278 
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282 		ASSERT_RTNL();
283 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284 		if (invisible)
285 			q->flags |= TCQ_F_INVISIBLE;
286 	}
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289 
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293 		ASSERT_RTNL();
294 		hash_del_rcu(&q->hash);
295 	}
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298 
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301 	struct Qdisc *q;
302 
303 	if (!handle)
304 		return NULL;
305 	q = qdisc_match_from_root(dev->qdisc, handle);
306 	if (q)
307 		goto out;
308 
309 	if (dev_ingress_queue(dev))
310 		q = qdisc_match_from_root(
311 			dev_ingress_queue(dev)->qdisc_sleeping,
312 			handle);
313 out:
314 	return q;
315 }
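/* Example (editorial sketch): handles are 32-bit "major:minor" values, so
 * looking up the qdisc that tc displays as "8001:" boils down to
 *
 *	struct Qdisc *q = qdisc_lookup(dev, TC_H_MAKE(0x80010000U, 0));
 *
 * which returns NULL if neither the root/ingress hierarchy nor the device
 * hash contains a match.
 */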
316 
317 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
318 {
319 	unsigned long cl;
320 	struct Qdisc *leaf;
321 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
322 
323 	if (cops == NULL)
324 		return NULL;
325 	cl = cops->find(p, classid);
326 
327 	if (cl == 0)
328 		return NULL;
329 	leaf = cops->leaf(p, cl);
330 	return leaf;
331 }
332 
333 /* Find queueing discipline by name */
334 
335 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
336 {
337 	struct Qdisc_ops *q = NULL;
338 
339 	if (kind) {
340 		read_lock(&qdisc_mod_lock);
341 		for (q = qdisc_base; q; q = q->next) {
342 			if (nla_strcmp(kind, q->id) == 0) {
343 				if (!try_module_get(q->owner))
344 					q = NULL;
345 				break;
346 			}
347 		}
348 		read_unlock(&qdisc_mod_lock);
349 	}
350 	return q;
351 }
352 
353 /* The linklayer setting was not transferred from iproute2 in older
354  * versions, and the rate table lookup system has been dropped from
355  * the kernel. To stay backward compatible with older iproute2 tc
356  * utils, we detect the linklayer setting by checking whether the
357  * rate table was modified.
358  *
359  * For linklayer ATM table entries, the rate table will be aligned to
360  * 48 bytes, thus some table entries will contain the same value.  The
361  * mpu (min packet unit) is also encoded into the old rate table, thus
362  * starting from the mpu, we find the low and high table entries for
363  * mapping this cell.  If these entries contain the same value, then
364  * the rate table has been modified for linklayer ATM.
365  *
366  * This is done by rounding the mpu up to the nearest 48-byte
367  * cell/entry, then rounding up to the next cell, calculating the
368  * table entry one below, and comparing the two.
369  */
370 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
371 {
372 	int low       = roundup(r->mpu, 48);
373 	int high      = roundup(low+1, 48);
374 	int cell_low  = low >> r->cell_log;
375 	int cell_high = (high >> r->cell_log) - 1;
376 
377 	/* rtab is too inaccurate at rates > 100Mbit/s */
378 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
379 		pr_debug("TC linklayer: Giving up ATM detection\n");
380 		return TC_LINKLAYER_ETHERNET;
381 	}
382 
383 	if ((cell_high > cell_low) && (cell_high < 256)
384 	    && (rtab[cell_low] == rtab[cell_high])) {
385 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
386 			 cell_low, cell_high, rtab[cell_high]);
387 		return TC_LINKLAYER_ATM;
388 	}
389 	return TC_LINKLAYER_ETHERNET;
390 }
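/* Example (editorial note): with r->mpu = 96 and r->cell_log = 3, the
 * function above computes
 *
 *	low  = roundup(96, 48) = 96	cell_low  = 96 >> 3        = 12
 *	high = roundup(97, 48) = 144	cell_high = (144 >> 3) - 1 = 17
 *
 * and reports ATM iff rtab[12] == rtab[17], i.e. both byte sizes fall into
 * the same 48-byte ATM cell of the user-supplied table.
 */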
391 
392 static struct qdisc_rate_table *qdisc_rtab_list;
393 
394 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
395 					struct nlattr *tab,
396 					struct netlink_ext_ack *extack)
397 {
398 	struct qdisc_rate_table *rtab;
399 
400 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
401 	    nla_len(tab) != TC_RTAB_SIZE) {
402 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
403 		return NULL;
404 	}
405 
406 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
407 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
408 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
409 			rtab->refcnt++;
410 			return rtab;
411 		}
412 	}
413 
414 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
415 	if (rtab) {
416 		rtab->rate = *r;
417 		rtab->refcnt = 1;
418 		memcpy(rtab->data, nla_data(tab), 1024);
419 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
420 			r->linklayer = __detect_linklayer(r, rtab->data);
421 		rtab->next = qdisc_rtab_list;
422 		qdisc_rtab_list = rtab;
423 	} else {
424 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
425 	}
426 	return rtab;
427 }
428 EXPORT_SYMBOL(qdisc_get_rtab);
429 
430 void qdisc_put_rtab(struct qdisc_rate_table *tab)
431 {
432 	struct qdisc_rate_table *rtab, **rtabp;
433 
434 	if (!tab || --tab->refcnt)
435 		return;
436 
437 	for (rtabp = &qdisc_rtab_list;
438 	     (rtab = *rtabp) != NULL;
439 	     rtabp = &rtab->next) {
440 		if (rtab == tab) {
441 			*rtabp = rtab->next;
442 			kfree(rtab);
443 			return;
444 		}
445 	}
446 }
447 EXPORT_SYMBOL(qdisc_put_rtab);
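/* Example (editorial sketch): rate tables are shared and refcounted, so a
 * rate-based qdisc pairs the two calls above roughly like this (the
 * TCA_EXAMPLE_RTAB attribute is hypothetical):
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB], extack);
 *	if (!rtab)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);	// in ->destroy() or on the error path
 */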
448 
449 static LIST_HEAD(qdisc_stab_list);
450 
451 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
452 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
453 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
454 };
455 
456 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
457 					       struct netlink_ext_ack *extack)
458 {
459 	struct nlattr *tb[TCA_STAB_MAX + 1];
460 	struct qdisc_size_table *stab;
461 	struct tc_sizespec *s;
462 	unsigned int tsize = 0;
463 	u16 *tab = NULL;
464 	int err;
465 
466 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
467 	if (err < 0)
468 		return ERR_PTR(err);
469 	if (!tb[TCA_STAB_BASE]) {
470 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
471 		return ERR_PTR(-EINVAL);
472 	}
473 
474 	s = nla_data(tb[TCA_STAB_BASE]);
475 
476 	if (s->tsize > 0) {
477 		if (!tb[TCA_STAB_DATA]) {
478 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
479 			return ERR_PTR(-EINVAL);
480 		}
481 		tab = nla_data(tb[TCA_STAB_DATA]);
482 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
483 	}
484 
485 	if (tsize != s->tsize || (!tab && tsize > 0)) {
486 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
487 		return ERR_PTR(-EINVAL);
488 	}
489 
490 	list_for_each_entry(stab, &qdisc_stab_list, list) {
491 		if (memcmp(&stab->szopts, s, sizeof(*s)))
492 			continue;
493 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
494 			continue;
495 		stab->refcnt++;
496 		return stab;
497 	}
498 
499 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
500 	if (!stab)
501 		return ERR_PTR(-ENOMEM);
502 
503 	stab->refcnt = 1;
504 	stab->szopts = *s;
505 	if (tsize > 0)
506 		memcpy(stab->data, tab, tsize * sizeof(u16));
507 
508 	list_add_tail(&stab->list, &qdisc_stab_list);
509 
510 	return stab;
511 }
512 
513 static void stab_kfree_rcu(struct rcu_head *head)
514 {
515 	kfree(container_of(head, struct qdisc_size_table, rcu));
516 }
517 
518 void qdisc_put_stab(struct qdisc_size_table *tab)
519 {
520 	if (!tab)
521 		return;
522 
523 	if (--tab->refcnt == 0) {
524 		list_del(&tab->list);
525 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
526 	}
527 }
528 EXPORT_SYMBOL(qdisc_put_stab);
529 
530 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
531 {
532 	struct nlattr *nest;
533 
534 	nest = nla_nest_start(skb, TCA_STAB);
535 	if (nest == NULL)
536 		goto nla_put_failure;
537 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
538 		goto nla_put_failure;
539 	nla_nest_end(skb, nest);
540 
541 	return skb->len;
542 
543 nla_put_failure:
544 	return -1;
545 }
546 
547 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
548 			       const struct qdisc_size_table *stab)
549 {
550 	int pkt_len, slot;
551 
552 	pkt_len = skb->len + stab->szopts.overhead;
553 	if (unlikely(!stab->szopts.tsize))
554 		goto out;
555 
556 	slot = pkt_len + stab->szopts.cell_align;
557 	if (unlikely(slot < 0))
558 		slot = 0;
559 
560 	slot >>= stab->szopts.cell_log;
561 	if (likely(slot < stab->szopts.tsize))
562 		pkt_len = stab->data[slot];
563 	else
564 		pkt_len = stab->data[stab->szopts.tsize - 1] *
565 				(slot / stab->szopts.tsize) +
566 				stab->data[slot % stab->szopts.tsize];
567 
568 	pkt_len <<= stab->szopts.size_log;
569 out:
570 	if (unlikely(pkt_len < 1))
571 		pkt_len = 1;
572 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
573 }
574 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
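/* Example (editorial note): with szopts = { .overhead = 24, .cell_align = 0,
 * .cell_log = 6, .size_log = 0, .tsize = 512 }, a 1000-byte skb is sized as
 *
 *	pkt_len = 1000 + 24 = 1024
 *	slot    = 1024 >> 6 = 16
 *	pkt_len = stab->data[16] << 0
 *
 * i.e. the user-supplied table overrides the wire length the scheduler sees.
 */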
575 
576 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
577 {
578 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
579 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
580 			txt, qdisc->ops->id, qdisc->handle >> 16);
581 		qdisc->flags |= TCQ_F_WARN_NONWC;
582 	}
583 }
584 EXPORT_SYMBOL(qdisc_warn_nonwc);
585 
586 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
587 {
588 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
589 						 timer);
590 
591 	rcu_read_lock();
592 	__netif_schedule(qdisc_root(wd->qdisc));
593 	rcu_read_unlock();
594 
595 	return HRTIMER_NORESTART;
596 }
597 
598 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
599 				 clockid_t clockid)
600 {
601 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
602 	wd->timer.function = qdisc_watchdog;
603 	wd->qdisc = qdisc;
604 }
605 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
606 
607 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
608 {
609 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
610 }
611 EXPORT_SYMBOL(qdisc_watchdog_init);
612 
613 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
614 {
615 	if (test_bit(__QDISC_STATE_DEACTIVATED,
616 		     &qdisc_root_sleeping(wd->qdisc)->state))
617 		return;
618 
619 	if (wd->last_expires == expires)
620 		return;
621 
622 	wd->last_expires = expires;
623 	hrtimer_start(&wd->timer,
624 		      ns_to_ktime(expires),
625 		      HRTIMER_MODE_ABS_PINNED);
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
628 
629 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
630 {
631 	hrtimer_cancel(&wd->timer);
632 }
633 EXPORT_SYMBOL(qdisc_watchdog_cancel);
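/* Example (editorial sketch): shaping qdiscs such as sch_tbf drive the
 * watchdog roughly like this (q->watchdog lives in their private data):
 *
 *	qdisc_watchdog_init(&q->watchdog, sch);		// in ->init()
 *	qdisc_watchdog_schedule_ns(&q->watchdog,
 *				   now_ns + delay_ns);	// from ->dequeue()
 *	qdisc_watchdog_cancel(&q->watchdog);		// in ->reset()/->destroy()
 */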
634 
635 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
636 {
637 	struct hlist_head *h;
638 	unsigned int i;
639 
640 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
641 
642 	if (h != NULL) {
643 		for (i = 0; i < n; i++)
644 			INIT_HLIST_HEAD(&h[i]);
645 	}
646 	return h;
647 }
648 
649 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
650 {
651 	struct Qdisc_class_common *cl;
652 	struct hlist_node *next;
653 	struct hlist_head *nhash, *ohash;
654 	unsigned int nsize, nmask, osize;
655 	unsigned int i, h;
656 
657 	/* Rehash when load factor exceeds 0.75 */
658 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
659 		return;
660 	nsize = clhash->hashsize * 2;
661 	nmask = nsize - 1;
662 	nhash = qdisc_class_hash_alloc(nsize);
663 	if (nhash == NULL)
664 		return;
665 
666 	ohash = clhash->hash;
667 	osize = clhash->hashsize;
668 
669 	sch_tree_lock(sch);
670 	for (i = 0; i < osize; i++) {
671 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
672 			h = qdisc_class_hash(cl->classid, nmask);
673 			hlist_add_head(&cl->hnode, &nhash[h]);
674 		}
675 	}
676 	clhash->hash     = nhash;
677 	clhash->hashsize = nsize;
678 	clhash->hashmask = nmask;
679 	sch_tree_unlock(sch);
680 
681 	kvfree(ohash);
682 }
683 EXPORT_SYMBOL(qdisc_class_hash_grow);
684 
685 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
686 {
687 	unsigned int size = 4;
688 
689 	clhash->hash = qdisc_class_hash_alloc(size);
690 	if (!clhash->hash)
691 		return -ENOMEM;
692 	clhash->hashsize  = size;
693 	clhash->hashmask  = size - 1;
694 	clhash->hashelems = 0;
695 	return 0;
696 }
697 EXPORT_SYMBOL(qdisc_class_hash_init);
698 
699 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
700 {
701 	kvfree(clhash->hash);
702 }
703 EXPORT_SYMBOL(qdisc_class_hash_destroy);
704 
705 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
706 			     struct Qdisc_class_common *cl)
707 {
708 	unsigned int h;
709 
710 	INIT_HLIST_NODE(&cl->hnode);
711 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
712 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
713 	clhash->hashelems++;
714 }
715 EXPORT_SYMBOL(qdisc_class_hash_insert);
716 
717 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
718 			     struct Qdisc_class_common *cl)
719 {
720 	hlist_del(&cl->hnode);
721 	clhash->hashelems--;
722 }
723 EXPORT_SYMBOL(qdisc_class_hash_remove);
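/* Example (editorial sketch): a classful qdisc embeds Qdisc_class_common in
 * its per-class state and uses the helpers above like this (example_class
 * is hypothetical; HTB follows the same pattern):
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;	// classid + hnode
 *		...
 *	};
 *
 *	qdisc_class_hash_init(&q->clhash);		// in ->init()
 *	cl->common.classid = classid;
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);		// amortized rehash
 */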
724 
725 /* Allocate a unique handle from the space managed by the kernel.
726  * The possible range is [8000-FFFF]:0000 (0x8000 values).
727  */
728 static u32 qdisc_alloc_handle(struct net_device *dev)
729 {
730 	int i = 0x8000;
731 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
732 
733 	do {
734 		autohandle += TC_H_MAKE(0x10000U, 0);
735 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
736 			autohandle = TC_H_MAKE(0x80000000U, 0);
737 		if (!qdisc_lookup(dev, autohandle))
738 			return autohandle;
739 		cond_resched();
740 	} while	(--i > 0);
741 
742 	return 0;
743 }
744 
745 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
746 			       unsigned int len)
747 {
748 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
749 	const struct Qdisc_class_ops *cops;
750 	unsigned long cl;
751 	u32 parentid;
752 	bool notify;
753 	int drops;
754 
755 	if (n == 0 && len == 0)
756 		return;
757 	drops = max_t(int, n, 0);
758 	rcu_read_lock();
759 	while ((parentid = sch->parent)) {
760 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
761 			break;
762 
763 		if (sch->flags & TCQ_F_NOPARENT)
764 			break;
765 		/* Notify the parent qdisc only if the child qdisc becomes empty.
766 		 *
767 		 * If the child was empty even before this update, then the
768 		 * backlog counter is broken and we skip the notification,
769 		 * because the parent class is already passive.
770 		 *
771 		 * If the original child was offloaded, then it is allowed to
772 		 * appear empty, so the parent is notified anyway.
773 		 */
774 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
775 						       !qdisc_is_offloaded);
776 		/* TODO: perform the search on a per txq basis */
777 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
778 		if (sch == NULL) {
779 			WARN_ON_ONCE(parentid != TC_H_ROOT);
780 			break;
781 		}
782 		cops = sch->ops->cl_ops;
783 		if (notify && cops->qlen_notify) {
784 			cl = cops->find(sch, parentid);
785 			cops->qlen_notify(sch, cl);
786 		}
787 		sch->q.qlen -= n;
788 		sch->qstats.backlog -= len;
789 		__qdisc_qstats_drop(sch, drops);
790 	}
791 	rcu_read_unlock();
792 }
793 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
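/* Example (editorial sketch): a qdisc that drops packets outside its
 * enqueue path, e.g. while shrinking its limit in ->change(), reports the
 * loss to its ancestors roughly like this:
 *
 *	unsigned int dropped = 0, dropped_len = 0;
 *
 *	while (sch->q.qlen > new_limit) {
 *		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
 *
 *		dropped_len += qdisc_pkt_len(skb);
 *		dropped++;
 *		qdisc_qstats_backlog_dec(sch, skb);
 *		rtnl_kfree_skbs(skb, skb);
 *	}
 *	qdisc_tree_reduce_backlog(sch, dropped, dropped_len);
 */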
794 
795 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
796 			 u32 portid, u32 seq, u16 flags, int event)
797 {
798 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
799 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
800 	struct tcmsg *tcm;
801 	struct nlmsghdr  *nlh;
802 	unsigned char *b = skb_tail_pointer(skb);
803 	struct gnet_dump d;
804 	struct qdisc_size_table *stab;
805 	u32 block_index;
806 	__u32 qlen;
807 
808 	cond_resched();
809 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
810 	if (!nlh)
811 		goto out_nlmsg_trim;
812 	tcm = nlmsg_data(nlh);
813 	tcm->tcm_family = AF_UNSPEC;
814 	tcm->tcm__pad1 = 0;
815 	tcm->tcm__pad2 = 0;
816 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
817 	tcm->tcm_parent = clid;
818 	tcm->tcm_handle = q->handle;
819 	tcm->tcm_info = refcount_read(&q->refcnt);
820 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
821 		goto nla_put_failure;
822 	if (q->ops->ingress_block_get) {
823 		block_index = q->ops->ingress_block_get(q);
824 		if (block_index &&
825 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
826 			goto nla_put_failure;
827 	}
828 	if (q->ops->egress_block_get) {
829 		block_index = q->ops->egress_block_get(q);
830 		if (block_index &&
831 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
832 			goto nla_put_failure;
833 	}
834 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
835 		goto nla_put_failure;
836 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
837 		goto nla_put_failure;
838 	qlen = qdisc_qlen_sum(q);
839 
840 	stab = rtnl_dereference(q->stab);
841 	if (stab && qdisc_dump_stab(skb, stab) < 0)
842 		goto nla_put_failure;
843 
844 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
845 					 NULL, &d, TCA_PAD) < 0)
846 		goto nla_put_failure;
847 
848 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
849 		goto nla_put_failure;
850 
851 	if (qdisc_is_percpu_stats(q)) {
852 		cpu_bstats = q->cpu_bstats;
853 		cpu_qstats = q->cpu_qstats;
854 	}
855 
856 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
857 				  &d, cpu_bstats, &q->bstats) < 0 ||
858 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
859 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
860 		goto nla_put_failure;
861 
862 	if (gnet_stats_finish_copy(&d) < 0)
863 		goto nla_put_failure;
864 
865 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
866 	return skb->len;
867 
868 out_nlmsg_trim:
869 nla_put_failure:
870 	nlmsg_trim(skb, b);
871 	return -1;
872 }
873 
874 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
875 {
876 	if (q->flags & TCQ_F_BUILTIN)
877 		return true;
878 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
879 		return true;
880 
881 	return false;
882 }
883 
884 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
885 			struct nlmsghdr *n, u32 clid,
886 			struct Qdisc *old, struct Qdisc *new)
887 {
888 	struct sk_buff *skb;
889 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
890 
891 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
892 	if (!skb)
893 		return -ENOBUFS;
894 
895 	if (old && !tc_qdisc_dump_ignore(old, false)) {
896 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
897 				  0, RTM_DELQDISC) < 0)
898 			goto err_out;
899 	}
900 	if (new && !tc_qdisc_dump_ignore(new, false)) {
901 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
902 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
903 			goto err_out;
904 	}
905 
906 	if (skb->len)
907 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
908 				      n->nlmsg_flags & NLM_F_ECHO);
909 
910 err_out:
911 	kfree_skb(skb);
912 	return -EINVAL;
913 }
914 
915 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
916 			       struct nlmsghdr *n, u32 clid,
917 			       struct Qdisc *old, struct Qdisc *new)
918 {
919 	if (new || old)
920 		qdisc_notify(net, skb, n, clid, old, new);
921 
922 	if (old)
923 		qdisc_destroy(old);
924 }
925 
926 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
927  * to device "dev".
928  *
929  * When appropriate, send a netlink notification using "skb"
930  * and "n".
931  *
932  * On success, destroy old qdisc.
933  */
934 
935 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
936 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
937 		       struct Qdisc *new, struct Qdisc *old,
938 		       struct netlink_ext_ack *extack)
939 {
940 	struct Qdisc *q = old;
941 	struct net *net = dev_net(dev);
942 	int err = 0;
943 
944 	if (parent == NULL) {
945 		unsigned int i, num_q, ingress;
946 
947 		ingress = 0;
948 		num_q = dev->num_tx_queues;
949 		if ((q && q->flags & TCQ_F_INGRESS) ||
950 		    (new && new->flags & TCQ_F_INGRESS)) {
951 			num_q = 1;
952 			ingress = 1;
953 			if (!dev_ingress_queue(dev)) {
954 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
955 				return -ENOENT;
956 			}
957 		}
958 
959 		if (dev->flags & IFF_UP)
960 			dev_deactivate(dev);
961 
962 		if (new && new->ops->attach)
963 			goto skip;
964 
965 		for (i = 0; i < num_q; i++) {
966 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
967 
968 			if (!ingress)
969 				dev_queue = netdev_get_tx_queue(dev, i);
970 
971 			old = dev_graft_qdisc(dev_queue, new);
972 			if (new && i > 0)
973 				qdisc_refcount_inc(new);
974 
975 			if (!ingress)
976 				qdisc_destroy(old);
977 		}
978 
979 skip:
980 		if (!ingress) {
981 			notify_and_destroy(net, skb, n, classid,
982 					   dev->qdisc, new);
983 			if (new && !new->ops->attach)
984 				qdisc_refcount_inc(new);
985 			dev->qdisc = new ? : &noop_qdisc;
986 
987 			if (new && new->ops->attach)
988 				new->ops->attach(new);
989 		} else {
990 			notify_and_destroy(net, skb, n, classid, old, new);
991 		}
992 
993 		if (dev->flags & IFF_UP)
994 			dev_activate(dev);
995 	} else {
996 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
997 
998 		/* Only support running a class lockless if the parent is lockless */
999 		if (new && (new->flags & TCQ_F_NOLOCK) &&
1000 		    parent && !(parent->flags & TCQ_F_NOLOCK))
1001 			new->flags &= ~TCQ_F_NOLOCK;
1002 
1003 		err = -EOPNOTSUPP;
1004 		if (cops && cops->graft) {
1005 			unsigned long cl = cops->find(parent, classid);
1006 
1007 			if (cl) {
1008 				err = cops->graft(parent, cl, new, &old,
1009 						  extack);
1010 			} else {
1011 				NL_SET_ERR_MSG(extack, "Specified class not found");
1012 				err = -ENOENT;
1013 			}
1014 		}
1015 		if (!err)
1016 			notify_and_destroy(net, skb, n, classid, old, new);
1017 	}
1018 	return err;
1019 }
1020 
1021 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1022 				   struct netlink_ext_ack *extack)
1023 {
1024 	u32 block_index;
1025 
1026 	if (tca[TCA_INGRESS_BLOCK]) {
1027 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1028 
1029 		if (!block_index) {
1030 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1031 			return -EINVAL;
1032 		}
1033 		if (!sch->ops->ingress_block_set) {
1034 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1035 			return -EOPNOTSUPP;
1036 		}
1037 		sch->ops->ingress_block_set(sch, block_index);
1038 	}
1039 	if (tca[TCA_EGRESS_BLOCK]) {
1040 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1041 
1042 		if (!block_index) {
1043 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1044 			return -EINVAL;
1045 		}
1046 		if (!sch->ops->egress_block_set) {
1047 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1048 			return -EOPNOTSUPP;
1049 		}
1050 		sch->ops->egress_block_set(sch, block_index);
1051 	}
1052 	return 0;
1053 }
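/* Example (editorial note): these attributes back iproute2's shared-block
 * syntax, e.g. "tc qdisc add dev eth0 ingress_block 22 ingress", which lets
 * several qdiscs attach their filters to one shared tcf_block (see
 * cls_api.c).
 */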
1054 
1055 /*
1056    Allocate and initialize a new qdisc.
1057 
1058    Parameters are passed via opt.
1059  */
1060 
1061 static struct Qdisc *qdisc_create(struct net_device *dev,
1062 				  struct netdev_queue *dev_queue,
1063 				  struct Qdisc *p, u32 parent, u32 handle,
1064 				  struct nlattr **tca, int *errp,
1065 				  struct netlink_ext_ack *extack)
1066 {
1067 	int err;
1068 	struct nlattr *kind = tca[TCA_KIND];
1069 	struct Qdisc *sch;
1070 	struct Qdisc_ops *ops;
1071 	struct qdisc_size_table *stab;
1072 
1073 	ops = qdisc_lookup_ops(kind);
1074 #ifdef CONFIG_MODULES
1075 	if (ops == NULL && kind != NULL) {
1076 		char name[IFNAMSIZ];
1077 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1078 			/* We dropped the RTNL semaphore in order to
1079 			 * perform the module load.  So, even if we
1080 			 * succeeded in loading the module, we have to
1081 			 * tell the caller to replay the request.  We
1082 			 * indicate this using -EAGAIN.
1083 			 * We replay the request because the device may
1084 			 * go away in the meantime.
1085 			 */
1086 			rtnl_unlock();
1087 			request_module("sch_%s", name);
1088 			rtnl_lock();
1089 			ops = qdisc_lookup_ops(kind);
1090 			if (ops != NULL) {
1091 			/* We will try qdisc_lookup_ops again,
1092 			 * so don't keep a reference.
1093 				 */
1094 				module_put(ops->owner);
1095 				err = -EAGAIN;
1096 				goto err_out;
1097 			}
1098 		}
1099 	}
1100 #endif
1101 
1102 	err = -ENOENT;
1103 	if (!ops) {
1104 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1105 		goto err_out;
1106 	}
1107 
1108 	sch = qdisc_alloc(dev_queue, ops, extack);
1109 	if (IS_ERR(sch)) {
1110 		err = PTR_ERR(sch);
1111 		goto err_out2;
1112 	}
1113 
1114 	sch->parent = parent;
1115 
1116 	if (handle == TC_H_INGRESS) {
1117 		sch->flags |= TCQ_F_INGRESS;
1118 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1119 	} else {
1120 		if (handle == 0) {
1121 			handle = qdisc_alloc_handle(dev);
1122 			err = -ENOMEM;
1123 			if (handle == 0)
1124 				goto err_out3;
1125 		}
1126 		if (!netif_is_multiqueue(dev))
1127 			sch->flags |= TCQ_F_ONETXQUEUE;
1128 	}
1129 
1130 	sch->handle = handle;
1131 
1132 	/* This exists to keep backward compatibility with a userspace
1133 	 * loophole that allowed userspace to get the IFF_NO_QUEUE
1134 	 * facility on older kernels by setting tx_queue_len=0 (prior
1135 	 * to qdisc init) and then forgetting to reinit tx_queue_len
1136 	 * before attaching a qdisc again.
1137 	 */
1138 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1139 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1140 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1141 	}
1142 
1143 	err = qdisc_block_indexes_set(sch, tca, extack);
1144 	if (err)
1145 		goto err_out3;
1146 
1147 	if (ops->init) {
1148 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1149 		if (err != 0)
1150 			goto err_out5;
1151 	}
1152 
1153 	if (tca[TCA_STAB]) {
1154 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1155 		if (IS_ERR(stab)) {
1156 			err = PTR_ERR(stab);
1157 			goto err_out4;
1158 		}
1159 		rcu_assign_pointer(sch->stab, stab);
1160 	}
1161 	if (tca[TCA_RATE]) {
1162 		seqcount_t *running;
1163 
1164 		err = -EOPNOTSUPP;
1165 		if (sch->flags & TCQ_F_MQROOT) {
1166 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1167 			goto err_out4;
1168 		}
1169 
1170 		if (sch->parent != TC_H_ROOT &&
1171 		    !(sch->flags & TCQ_F_INGRESS) &&
1172 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1173 			running = qdisc_root_sleeping_running(sch);
1174 		else
1175 			running = &sch->running;
1176 
1177 		err = gen_new_estimator(&sch->bstats,
1178 					sch->cpu_bstats,
1179 					&sch->rate_est,
1180 					NULL,
1181 					running,
1182 					tca[TCA_RATE]);
1183 		if (err) {
1184 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1185 			goto err_out4;
1186 		}
1187 	}
1188 
1189 	qdisc_hash_add(sch, false);
1190 
1191 	return sch;
1192 
1193 err_out5:
1194 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1195 	if (ops->destroy)
1196 		ops->destroy(sch);
1197 err_out3:
1198 	dev_put(dev);
1199 	qdisc_free(sch);
1200 err_out2:
1201 	module_put(ops->owner);
1202 err_out:
1203 	*errp = err;
1204 	return NULL;
1205 
1206 err_out4:
1207 	/*
1208 	 * Any broken qdiscs that would require an ops->reset() here?
1209 	 * The qdisc was never in action so it shouldn't be necessary.
1210 	 */
1211 	qdisc_put_stab(rtnl_dereference(sch->stab));
1212 	if (ops->destroy)
1213 		ops->destroy(sch);
1214 	goto err_out3;
1215 }
1216 
1217 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1218 			struct netlink_ext_ack *extack)
1219 {
1220 	struct qdisc_size_table *ostab, *stab = NULL;
1221 	int err = 0;
1222 
1223 	if (tca[TCA_OPTIONS]) {
1224 		if (!sch->ops->change) {
1225 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1226 			return -EINVAL;
1227 		}
1228 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1229 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1230 			return -EOPNOTSUPP;
1231 		}
1232 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1233 		if (err)
1234 			return err;
1235 	}
1236 
1237 	if (tca[TCA_STAB]) {
1238 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1239 		if (IS_ERR(stab))
1240 			return PTR_ERR(stab);
1241 	}
1242 
1243 	ostab = rtnl_dereference(sch->stab);
1244 	rcu_assign_pointer(sch->stab, stab);
1245 	qdisc_put_stab(ostab);
1246 
1247 	if (tca[TCA_RATE]) {
1248 		/* NB: ignores errors from replace_estimator
1249 		   because the change can't be undone. */
1250 		if (sch->flags & TCQ_F_MQROOT)
1251 			goto out;
1252 		gen_replace_estimator(&sch->bstats,
1253 				      sch->cpu_bstats,
1254 				      &sch->rate_est,
1255 				      NULL,
1256 				      qdisc_root_sleeping_running(sch),
1257 				      tca[TCA_RATE]);
1258 	}
1259 out:
1260 	return 0;
1261 }
1262 
1263 struct check_loop_arg {
1264 	struct qdisc_walker	w;
1265 	struct Qdisc		*p;
1266 	int			depth;
1267 };
1268 
1269 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1270 			 struct qdisc_walker *w);
1271 
1272 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1273 {
1274 	struct check_loop_arg	arg;
1275 
1276 	if (q->ops->cl_ops == NULL)
1277 		return 0;
1278 
1279 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1280 	arg.w.fn = check_loop_fn;
1281 	arg.depth = depth;
1282 	arg.p = p;
1283 	q->ops->cl_ops->walk(q, &arg.w);
1284 	return arg.w.stop ? -ELOOP : 0;
1285 }
1286 
1287 static int
1288 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1289 {
1290 	struct Qdisc *leaf;
1291 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1292 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1293 
1294 	leaf = cops->leaf(q, cl);
1295 	if (leaf) {
1296 		if (leaf == arg->p || arg->depth > 7)
1297 			return -ELOOP;
1298 		return check_loop(leaf, arg->p, arg->depth + 1);
1299 	}
1300 	return 0;
1301 }
1302 
1303 /*
1304  * Delete/get qdisc.
1305  */
1306 
1307 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1308 			struct netlink_ext_ack *extack)
1309 {
1310 	struct net *net = sock_net(skb->sk);
1311 	struct tcmsg *tcm = nlmsg_data(n);
1312 	struct nlattr *tca[TCA_MAX + 1];
1313 	struct net_device *dev;
1314 	u32 clid;
1315 	struct Qdisc *q = NULL;
1316 	struct Qdisc *p = NULL;
1317 	int err;
1318 
1319 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1320 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1321 		return -EPERM;
1322 
1323 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1324 	if (err < 0)
1325 		return err;
1326 
1327 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1328 	if (!dev)
1329 		return -ENODEV;
1330 
1331 	clid = tcm->tcm_parent;
1332 	if (clid) {
1333 		if (clid != TC_H_ROOT) {
1334 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1335 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1336 				if (!p) {
1337 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1338 					return -ENOENT;
1339 				}
1340 				q = qdisc_leaf(p, clid);
1341 			} else if (dev_ingress_queue(dev)) {
1342 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1343 			}
1344 		} else {
1345 			q = dev->qdisc;
1346 		}
1347 		if (!q) {
1348 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1349 			return -ENOENT;
1350 		}
1351 
1352 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1353 			NL_SET_ERR_MSG(extack, "Invalid handle");
1354 			return -EINVAL;
1355 		}
1356 	} else {
1357 		q = qdisc_lookup(dev, tcm->tcm_handle);
1358 		if (!q) {
1359 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1360 			return -ENOENT;
1361 		}
1362 	}
1363 
1364 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1365 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1366 		return -EINVAL;
1367 	}
1368 
1369 	if (n->nlmsg_type == RTM_DELQDISC) {
1370 		if (!clid) {
1371 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1372 			return -EINVAL;
1373 		}
1374 		if (q->handle == 0) {
1375 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1376 			return -ENOENT;
1377 		}
1378 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1379 		if (err != 0)
1380 			return err;
1381 	} else {
1382 		qdisc_notify(net, skb, n, clid, NULL, q);
1383 	}
1384 	return 0;
1385 }
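/* Example (editorial note): "tc qdisc del dev eth0 root" arrives here as
 * RTM_DELQDISC with tcm_parent == TC_H_ROOT, while a plain "tc qdisc show"
 * goes through the RTM_GETQDISC dump path (tc_dump_qdisc) instead.
 */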
1386 
1387 /*
1388  * Create/change qdisc.
1389  */
1390 
1391 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1392 			   struct netlink_ext_ack *extack)
1393 {
1394 	struct net *net = sock_net(skb->sk);
1395 	struct tcmsg *tcm;
1396 	struct nlattr *tca[TCA_MAX + 1];
1397 	struct net_device *dev;
1398 	u32 clid;
1399 	struct Qdisc *q, *p;
1400 	int err;
1401 
1402 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1403 		return -EPERM;
1404 
1405 replay:
1406 	/* Reinit, just in case something touches this. */
1407 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1408 	if (err < 0)
1409 		return err;
1410 
1411 	tcm = nlmsg_data(n);
1412 	clid = tcm->tcm_parent;
1413 	q = p = NULL;
1414 
1415 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1416 	if (!dev)
1417 		return -ENODEV;
1418 
1419 
1420 	if (clid) {
1421 		if (clid != TC_H_ROOT) {
1422 			if (clid != TC_H_INGRESS) {
1423 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1424 				if (!p) {
1425 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1426 					return -ENOENT;
1427 				}
1428 				q = qdisc_leaf(p, clid);
1429 			} else if (dev_ingress_queue_create(dev)) {
1430 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1431 			}
1432 		} else {
1433 			q = dev->qdisc;
1434 		}
1435 
1436 		/* It may be the default qdisc; ignore it */
1437 		if (q && q->handle == 0)
1438 			q = NULL;
1439 
1440 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1441 			if (tcm->tcm_handle) {
1442 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1443 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1444 					return -EEXIST;
1445 				}
1446 				if (TC_H_MIN(tcm->tcm_handle)) {
1447 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1448 					return -EINVAL;
1449 				}
1450 				q = qdisc_lookup(dev, tcm->tcm_handle);
1451 				if (!q)
1452 					goto create_n_graft;
1453 				if (n->nlmsg_flags & NLM_F_EXCL) {
1454 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1455 					return -EEXIST;
1456 				}
1457 				if (tca[TCA_KIND] &&
1458 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1459 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1460 					return -EINVAL;
1461 				}
1462 				if (q == p ||
1463 				    (p && check_loop(q, p, 0))) {
1464 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1465 					return -ELOOP;
1466 				}
1467 				qdisc_refcount_inc(q);
1468 				goto graft;
1469 			} else {
1470 				if (!q)
1471 					goto create_n_graft;
1472 
1473 				/* This magic test requires explanation.
1474 				 *
1475 				 *   We know that some child q is already
1476 				 *   attached to this parent and we have a choice:
1477 				 *   either to change it or to create/graft a new one.
1478 				 *
1479 				 *   1. We are allowed to create/graft only
1480 				 *   if both the CREATE and REPLACE flags are set.
1481 				 *
1482 				 *   2. If EXCL is set, the requestor wanted to say
1483 				 *   that the qdisc tcm_handle is not expected
1484 				 *   to exist, so we choose create/graft too.
1485 				 *
1486 				 *   3. The last case is when no flags are set.
1487 				 *   Alas, it is a sort of hole in the API; we
1488 				 *   cannot decide what to do unambiguously.
1489 				 *   For now we select create/graft if the
1490 				 *   user gave a KIND which does not match the existing one.
1491 				 */
1492 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1493 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1494 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1495 				     (tca[TCA_KIND] &&
1496 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1497 					goto create_n_graft;
1498 			}
1499 		}
1500 	} else {
1501 		if (!tcm->tcm_handle) {
1502 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1503 			return -EINVAL;
1504 		}
1505 		q = qdisc_lookup(dev, tcm->tcm_handle);
1506 	}
1507 
1508 	/* Change qdisc parameters */
1509 	if (!q) {
1510 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1511 		return -ENOENT;
1512 	}
1513 	if (n->nlmsg_flags & NLM_F_EXCL) {
1514 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1515 		return -EEXIST;
1516 	}
1517 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1518 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1519 		return -EINVAL;
1520 	}
1521 	err = qdisc_change(q, tca, extack);
1522 	if (err == 0)
1523 		qdisc_notify(net, skb, n, clid, NULL, q);
1524 	return err;
1525 
1526 create_n_graft:
1527 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1528 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1529 		return -ENOENT;
1530 	}
1531 	if (clid == TC_H_INGRESS) {
1532 		if (dev_ingress_queue(dev)) {
1533 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1534 					 tcm->tcm_parent, tcm->tcm_parent,
1535 					 tca, &err, extack);
1536 		} else {
1537 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1538 			err = -ENOENT;
1539 		}
1540 	} else {
1541 		struct netdev_queue *dev_queue;
1542 
1543 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1544 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1545 		else if (p)
1546 			dev_queue = p->dev_queue;
1547 		else
1548 			dev_queue = netdev_get_tx_queue(dev, 0);
1549 
1550 		q = qdisc_create(dev, dev_queue, p,
1551 				 tcm->tcm_parent, tcm->tcm_handle,
1552 				 tca, &err, extack);
1553 	}
1554 	if (q == NULL) {
1555 		if (err == -EAGAIN)
1556 			goto replay;
1557 		return err;
1558 	}
1559 
1560 graft:
1561 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1562 	if (err) {
1563 		if (q)
1564 			qdisc_destroy(q);
1565 		return err;
1566 	}
1567 
1568 	return 0;
1569 }
1570 
1571 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1572 			      struct netlink_callback *cb,
1573 			      int *q_idx_p, int s_q_idx, bool recur,
1574 			      bool dump_invisible)
1575 {
1576 	int ret = 0, q_idx = *q_idx_p;
1577 	struct Qdisc *q;
1578 	int b;
1579 
1580 	if (!root)
1581 		return 0;
1582 
1583 	q = root;
1584 	if (q_idx < s_q_idx) {
1585 		q_idx++;
1586 	} else {
1587 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1588 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1589 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1590 				  RTM_NEWQDISC) <= 0)
1591 			goto done;
1592 		q_idx++;
1593 	}
1594 
1595 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1596 	 * itself has already been dumped.
1597 	 *
1598 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1599 	 * qdisc hashtable, we don't want to hit it again.
1600 	 */
1601 	if (!qdisc_dev(root) || !recur)
1602 		goto out;
1603 
1604 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1605 		if (q_idx < s_q_idx) {
1606 			q_idx++;
1607 			continue;
1608 		}
1609 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1610 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1611 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1612 				  RTM_NEWQDISC) <= 0)
1613 			goto done;
1614 		q_idx++;
1615 	}
1616 
1617 out:
1618 	*q_idx_p = q_idx;
1619 	return ret;
1620 done:
1621 	ret = -1;
1622 	goto out;
1623 }
1624 
1625 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1626 {
1627 	struct net *net = sock_net(skb->sk);
1628 	int idx, q_idx;
1629 	int s_idx, s_q_idx;
1630 	struct net_device *dev;
1631 	const struct nlmsghdr *nlh = cb->nlh;
1632 	struct nlattr *tca[TCA_MAX + 1];
1633 	int err;
1634 
1635 	s_idx = cb->args[0];
1636 	s_q_idx = q_idx = cb->args[1];
1637 
1638 	idx = 0;
1639 	ASSERT_RTNL();
1640 
1641 	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1642 	if (err < 0)
1643 		return err;
1644 
1645 	for_each_netdev(net, dev) {
1646 		struct netdev_queue *dev_queue;
1647 
1648 		if (idx < s_idx)
1649 			goto cont;
1650 		if (idx > s_idx)
1651 			s_q_idx = 0;
1652 		q_idx = 0;
1653 
1654 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1655 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1656 			goto done;
1657 
1658 		dev_queue = dev_ingress_queue(dev);
1659 		if (dev_queue &&
1660 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1661 				       &q_idx, s_q_idx, false,
1662 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1663 			goto done;
1664 
1665 cont:
1666 		idx++;
1667 	}
1668 
1669 done:
1670 	cb->args[0] = idx;
1671 	cb->args[1] = q_idx;
1672 
1673 	return skb->len;
1674 }
1675 
1676 
1677 
1678 /************************************************
1679  *	Traffic classes manipulation.		*
1680  ************************************************/
1681 
1682 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1683 			  unsigned long cl,
1684 			  u32 portid, u32 seq, u16 flags, int event)
1685 {
1686 	struct tcmsg *tcm;
1687 	struct nlmsghdr  *nlh;
1688 	unsigned char *b = skb_tail_pointer(skb);
1689 	struct gnet_dump d;
1690 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1691 
1692 	cond_resched();
1693 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1694 	if (!nlh)
1695 		goto out_nlmsg_trim;
1696 	tcm = nlmsg_data(nlh);
1697 	tcm->tcm_family = AF_UNSPEC;
1698 	tcm->tcm__pad1 = 0;
1699 	tcm->tcm__pad2 = 0;
1700 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1701 	tcm->tcm_parent = q->handle;
1702 	tcm->tcm_handle = q->handle;
1703 	tcm->tcm_info = 0;
1704 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1705 		goto nla_put_failure;
1706 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1707 		goto nla_put_failure;
1708 
1709 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1710 					 NULL, &d, TCA_PAD) < 0)
1711 		goto nla_put_failure;
1712 
1713 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1714 		goto nla_put_failure;
1715 
1716 	if (gnet_stats_finish_copy(&d) < 0)
1717 		goto nla_put_failure;
1718 
1719 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1720 	return skb->len;
1721 
1722 out_nlmsg_trim:
1723 nla_put_failure:
1724 	nlmsg_trim(skb, b);
1725 	return -1;
1726 }
1727 
1728 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1729 			 struct nlmsghdr *n, struct Qdisc *q,
1730 			 unsigned long cl, int event)
1731 {
1732 	struct sk_buff *skb;
1733 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1734 
1735 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1736 	if (!skb)
1737 		return -ENOBUFS;
1738 
1739 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1740 		kfree_skb(skb);
1741 		return -EINVAL;
1742 	}
1743 
1744 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1745 			      n->nlmsg_flags & NLM_F_ECHO);
1746 }
1747 
1748 static int tclass_del_notify(struct net *net,
1749 			     const struct Qdisc_class_ops *cops,
1750 			     struct sk_buff *oskb, struct nlmsghdr *n,
1751 			     struct Qdisc *q, unsigned long cl)
1752 {
1753 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1754 	struct sk_buff *skb;
1755 	int err = 0;
1756 
1757 	if (!cops->delete)
1758 		return -EOPNOTSUPP;
1759 
1760 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1761 	if (!skb)
1762 		return -ENOBUFS;
1763 
1764 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1765 			   RTM_DELTCLASS) < 0) {
1766 		kfree_skb(skb);
1767 		return -EINVAL;
1768 	}
1769 
1770 	err = cops->delete(q, cl);
1771 	if (err) {
1772 		kfree_skb(skb);
1773 		return err;
1774 	}
1775 
1776 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1777 			      n->nlmsg_flags & NLM_F_ECHO);
1778 }
1779 
1780 #ifdef CONFIG_NET_CLS
1781 
1782 struct tcf_bind_args {
1783 	struct tcf_walker w;
1784 	u32 classid;
1785 	unsigned long cl;
1786 };
1787 
1788 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1789 {
1790 	struct tcf_bind_args *a = (void *)arg;
1791 
1792 	if (tp->ops->bind_class) {
1793 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1794 
1795 		sch_tree_lock(q);
1796 		tp->ops->bind_class(n, a->classid, a->cl);
1797 		sch_tree_unlock(q);
1798 	}
1799 	return 0;
1800 }
1801 
1802 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1803 			   unsigned long new_cl)
1804 {
1805 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1806 	struct tcf_block *block;
1807 	struct tcf_chain *chain;
1808 	unsigned long cl;
1809 
1810 	cl = cops->find(q, portid);
1811 	if (!cl)
1812 		return;
1813 	block = cops->tcf_block(q, cl, NULL);
1814 	if (!block)
1815 		return;
1816 	list_for_each_entry(chain, &block->chain_list, list) {
1817 		struct tcf_proto *tp;
1818 
1819 		for (tp = rtnl_dereference(chain->filter_chain);
1820 		     tp; tp = rtnl_dereference(tp->next)) {
1821 			struct tcf_bind_args arg = {};
1822 
1823 			arg.w.fn = tcf_node_bind;
1824 			arg.classid = clid;
1825 			arg.cl = new_cl;
1826 			tp->ops->walk(tp, &arg.w);
1827 		}
1828 	}
1829 }
1830 
1831 #else
1832 
1833 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1834 			   unsigned long new_cl)
1835 {
1836 }
1837 
1838 #endif
1839 
1840 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1841 			 struct netlink_ext_ack *extack)
1842 {
1843 	struct net *net = sock_net(skb->sk);
1844 	struct tcmsg *tcm = nlmsg_data(n);
1845 	struct nlattr *tca[TCA_MAX + 1];
1846 	struct net_device *dev;
1847 	struct Qdisc *q = NULL;
1848 	const struct Qdisc_class_ops *cops;
1849 	unsigned long cl = 0;
1850 	unsigned long new_cl;
1851 	u32 portid;
1852 	u32 clid;
1853 	u32 qid;
1854 	int err;
1855 
1856 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1857 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1858 		return -EPERM;
1859 
1860 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1861 	if (err < 0)
1862 		return err;
1863 
1864 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1865 	if (!dev)
1866 		return -ENODEV;
1867 
1868 	/*
1869 	   parent == TC_H_UNSPEC - unspecified parent.
1870 	   parent == TC_H_ROOT   - class is root, which has no parent.
1871 	   parent == X:0	 - parent is root class.
1872 	   parent == X:Y	 - parent is a node in hierarchy.
1873 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1874 
1875 	   handle == 0:0	 - generate handle from kernel pool.
1876 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1877 	   handle == X:Y	 - class is X:Y (fully specified).
1878 	   handle == X:0	 - root class.
1879 	 */
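	/* Example (editorial note): for "tc class add dev eth0 parent 1:
	 * classid 1:10 ..." userspace sends tcm_parent == 0x00010000 and
	 * tcm_handle == 0x00010010, so qid below resolves to 0x00010000 and
	 * the class being created is 1:10 in the qdisc 1: hierarchy.
	 */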
1880 
1881 	/* Step 1. Determine qdisc handle X:0 */
1882 
1883 	portid = tcm->tcm_parent;
1884 	clid = tcm->tcm_handle;
1885 	qid = TC_H_MAJ(clid);
1886 
1887 	if (portid != TC_H_ROOT) {
1888 		u32 qid1 = TC_H_MAJ(portid);
1889 
1890 		if (qid && qid1) {
1891 			/* If both majors are known, they must be identical. */
1892 			if (qid != qid1)
1893 				return -EINVAL;
1894 		} else if (qid1) {
1895 			qid = qid1;
1896 		} else if (qid == 0)
1897 			qid = dev->qdisc->handle;
1898 
1899 		/* Now qid is a genuine qdisc handle, consistent
1900 		 * with both parent and child.
1901 		 *
1902 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1903 		 */
1904 		if (portid)
1905 			portid = TC_H_MAKE(qid, portid);
1906 	} else {
1907 		if (qid == 0)
1908 			qid = dev->qdisc->handle;
1909 	}
1910 
1911 	/* OK. Locate qdisc */
1912 	q = qdisc_lookup(dev, qid);
1913 	if (!q)
1914 		return -ENOENT;
1915 
1916 	/* And check that it supports classes */
1917 	cops = q->ops->cl_ops;
1918 	if (cops == NULL)
1919 		return -EINVAL;
1920 
1921 	/* Now try to get class */
1922 	if (clid == 0) {
1923 		if (portid == TC_H_ROOT)
1924 			clid = qid;
1925 	} else
1926 		clid = TC_H_MAKE(qid, clid);
1927 
1928 	if (clid)
1929 		cl = cops->find(q, clid);
1930 
1931 	if (cl == 0) {
1932 		err = -ENOENT;
1933 		if (n->nlmsg_type != RTM_NEWTCLASS ||
1934 		    !(n->nlmsg_flags & NLM_F_CREATE))
1935 			goto out;
1936 	} else {
1937 		switch (n->nlmsg_type) {
1938 		case RTM_NEWTCLASS:
1939 			err = -EEXIST;
1940 			if (n->nlmsg_flags & NLM_F_EXCL)
1941 				goto out;
1942 			break;
1943 		case RTM_DELTCLASS:
1944 			err = tclass_del_notify(net, cops, skb, n, q, cl);
1945 			/* Unbind the class from its filters by binding them to 0 */
1946 			tc_bind_tclass(q, portid, clid, 0);
1947 			goto out;
1948 		case RTM_GETTCLASS:
1949 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1950 			goto out;
1951 		default:
1952 			err = -EINVAL;
1953 			goto out;
1954 		}
1955 	}
1956 
1957 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1958 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1959 		return -EOPNOTSUPP;
1960 	}
1961 
1962 	new_cl = cl;
1963 	err = -EOPNOTSUPP;
1964 	if (cops->change)
1965 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
1966 	if (err == 0) {
1967 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1968 		/* We just created a new class; we need to do the reverse binding. */
1969 		if (cl != new_cl)
1970 			tc_bind_tclass(q, portid, clid, new_cl);
1971 	}
1972 out:
1973 	return err;
1974 }
1975 
1976 struct qdisc_dump_args {
1977 	struct qdisc_walker	w;
1978 	struct sk_buff		*skb;
1979 	struct netlink_callback	*cb;
1980 };
1981 
1982 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1983 			    struct qdisc_walker *arg)
1984 {
1985 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1986 
1987 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1988 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1989 			      RTM_NEWTCLASS);
1990 }
1991 
1992 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1993 				struct tcmsg *tcm, struct netlink_callback *cb,
1994 				int *t_p, int s_t)
1995 {
1996 	struct qdisc_dump_args arg;
1997 
1998 	if (tc_qdisc_dump_ignore(q, false) ||
1999 	    *t_p < s_t || !q->ops->cl_ops ||
2000 	    (tcm->tcm_parent &&
2001 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2002 		(*t_p)++;
2003 		return 0;
2004 	}
2005 	if (*t_p > s_t)
2006 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2007 	arg.w.fn = qdisc_class_dump;
2008 	arg.skb = skb;
2009 	arg.cb = cb;
2010 	arg.w.stop  = 0;
2011 	arg.w.skip = cb->args[1];
2012 	arg.w.count = 0;
2013 	q->ops->cl_ops->walk(q, &arg.w);
2014 	cb->args[1] = arg.w.count;
2015 	if (arg.w.stop)
2016 		return -1;
2017 	(*t_p)++;
2018 	return 0;
2019 }
2020 
2021 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2022 			       struct tcmsg *tcm, struct netlink_callback *cb,
2023 			       int *t_p, int s_t)
2024 {
2025 	struct Qdisc *q;
2026 	int b;
2027 
2028 	if (!root)
2029 		return 0;
2030 
2031 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2032 		return -1;
2033 
2034 	if (!qdisc_dev(root))
2035 		return 0;
2036 
2037 	if (tcm->tcm_parent) {
2038 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2039 		if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2040 			return -1;
2041 		return 0;
2042 	}
2043 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2044 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2045 			return -1;
2046 	}
2047 
2048 	return 0;
2049 }
2050 
2051 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2052 {
2053 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2054 	struct net *net = sock_net(skb->sk);
2055 	struct netdev_queue *dev_queue;
2056 	struct net_device *dev;
2057 	int t, s_t;
2058 
2059 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2060 		return 0;
2061 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2062 	if (!dev)
2063 		return 0;
2064 
2065 	s_t = cb->args[0];
2066 	t = 0;
2067 
2068 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2069 		goto done;
2070 
2071 	dev_queue = dev_ingress_queue(dev);
2072 	if (dev_queue &&
2073 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2074 				&t, s_t) < 0)
2075 		goto done;
2076 
2077 done:
2078 	cb->args[0] = t;
2079 
2080 	dev_put(dev);
2081 	return skb->len;
2082 }
2083 
2084 #ifdef CONFIG_PROC_FS
2085 static int psched_show(struct seq_file *seq, void *v)
2086 {
2087 	seq_printf(seq, "%08x %08x %08x %08x\n",
2088 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2089 		   1000000,
2090 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2091 
2092 	return 0;
2093 }
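/* Example (editorial note): iproute2 calibrates its clock from these four
 * words in /proc/net/psched; on a typical high-resolution build they read
 *
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. 1000 ns per usec, 64 ns per psched tick (PSCHED_SHIFT = 6), the
 * legacy 1 MHz field, and 10^9 timer events per second (1 ns hrtimer
 * resolution).
 */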
2094 
2095 static int __net_init psched_net_init(struct net *net)
2096 {
2097 	struct proc_dir_entry *e;
2098 
2099 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2100 	if (e == NULL)
2101 		return -ENOMEM;
2102 
2103 	return 0;
2104 }
2105 
2106 static void __net_exit psched_net_exit(struct net *net)
2107 {
2108 	remove_proc_entry("psched", net->proc_net);
2109 }
2110 #else
2111 static int __net_init psched_net_init(struct net *net)
2112 {
2113 	return 0;
2114 }
2115 
2116 static void __net_exit psched_net_exit(struct net *net)
2117 {
2118 }
2119 #endif
2120 
2121 static struct pernet_operations psched_net_ops = {
2122 	.init = psched_net_init,
2123 	.exit = psched_net_exit,
2124 };
2125 
2126 static int __init pktsched_init(void)
2127 {
2128 	int err;
2129 
2130 	err = register_pernet_subsys(&psched_net_ops);
2131 	if (err) {
2132 		pr_err("pktsched_init: "
2133 		       "cannot initialize per netns operations\n");
2134 		return err;
2135 	}
2136 
2137 	register_qdisc(&pfifo_fast_ops);
2138 	register_qdisc(&pfifo_qdisc_ops);
2139 	register_qdisc(&bfifo_qdisc_ops);
2140 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2141 	register_qdisc(&mq_qdisc_ops);
2142 	register_qdisc(&noqueue_qdisc_ops);
2143 
2144 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2145 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2146 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2147 		      0);
2148 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2149 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2150 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2151 		      0);
2152 
2153 	return 0;
2154 }
2155 
2156 subsys_initcall(pktsched_init);
2157