xref: /linux/net/sched/sch_api.c (revision 93d546399c2b7d66a54d5fbd5eee17de19246bf6)
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that can
   enqueue packets and dequeue them (when the device is ready to send
   something) in an order and at times determined by the algorithm
   hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The routines in this file translate the handles supplied by the
   user into a form the kernel can work with, perform sanity checks
   and the parts of the work that are common to all qdiscs, and send
   rtnetlink notifications.

   All the real intelligence lives inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything right now.
   The queue is really empty iff q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be kept valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was already dequeued. It is used for
   non-standard or just buggy devices that can defer output even when
   netif_queue_stopped() == 0.

   ---reset

   returns the qdisc to its initial state: purges all buffers and
   clears all timers and counters (except for statistics).

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
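
/*
 * Illustrative sketch (not part of this file): a minimal classless
 * "queue" qdisc honouring the enqueue/dequeue contract described
 * above.  The name sketch_fifo and the fixed limit of 128 packets are
 * invented for the example; treat it as a sketch of the API shape for
 * this era of the tree, not as a drop-in module.
 */
#if 0
static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (likely(skb_queue_len(&sch->q) < 128)) {
		__skb_queue_tail(&sch->q, skb);		/* keeps q.qlen valid */
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}
	/* This packet is the one dropped, so NET_XMIT_DROP is accurate. */
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *sketch_dequeue(struct Qdisc *sch)
{
	/* NULL only means "nothing to send right now"; emptiness is
	 * signalled by q.qlen == 0, as explained above. */
	return __skb_dequeue(&sch->q);
}

static struct Qdisc_ops sketch_fifo_ops __read_mostly = {
	.id		= "sketch_fifo",
	.priv_size	= 0,
	.enqueue	= sketch_enqueue,
	.dequeue	= sketch_dequeue,
	.owner		= THIS_MODULE,
};
#endif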

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
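
/*
 * Illustrative sketch (not part of this file): how a scheduler module
 * would use the registration API above; sketch_fifo_ops is the
 * invented ops table from the earlier sketch.  register_qdisc()
 * returns -EEXIST when the id is already taken, so the error must be
 * propagated.
 */
#if 0
static int __init sketch_module_init(void)
{
	return register_qdisc(&sketch_fifo_ops);
}

static void __exit sketch_module_exit(void)
{
	unregister_qdisc(&sketch_fifo_ops);
}

module_init(sketch_module_init);
module_exit(sketch_module_exit);
MODULE_LICENSE("GPL");
#endif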

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

/*
 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
 */
static DEFINE_SPINLOCK(qdisc_list_lock);

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_del(&q->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;
	struct Qdisc *q;

	spin_lock_bh(&qdisc_list_lock);

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);
		if (q)
			goto unlock;
	}

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);

unlock:
	spin_unlock_bh(&qdisc_list_lock);

	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		/* length was validated against TC_RTAB_SIZE (1024) above */
		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
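
/*
 * Illustrative sketch (not from this file): how a shaper's ->init()
 * typically obtains a shared, reference-counted rate table from its
 * netlink attributes.  sketch_rtab and the helper names are invented;
 * sch_tbf does the real-world equivalent.
 */
#if 0
static struct qdisc_rate_table *sketch_rtab;

static int sketch_set_rate(struct tc_ratespec *r, struct nlattr *rtab_attr)
{
	sketch_rtab = qdisc_get_rtab(r, rtab_attr);	/* takes a reference */
	if (sketch_rtab == NULL)
		return -EINVAL;	/* bad spec or missing/short table data */
	return 0;
}

static void sketch_clear_rate(void)
{
	qdisc_put_rtab(sketch_rtab);	/* NULL-safe; frees on the last put */
}
#endif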

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
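
/*
 * Worked example (illustrative numbers, not from the tree): with
 * szopts = { .cell_log = 6, .size_log = 6, .cell_align = -1,
 * .overhead = 0, .tsize = 512 } and data[i] = i + 1, a 1500 byte
 * packet gives slot = (1500 - 1) >> 6 = 23, so
 * pkt_len = data[23] << 6 = 24 * 64 = 1536: lengths are rounded up to
 * the next 64 byte cell, which is how a size table models per-cell
 * media such as ATM.
 */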

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
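
/*
 * Illustrative sketch (not from this file): how a rate-limiting qdisc
 * typically drives the watchdog from ->dequeue() when the head packet
 * is not due yet.  sketch_shaper and next_send are invented names;
 * sch_tbf follows this pattern for real.
 */
#if 0
struct sketch_shaper {
	struct qdisc_watchdog	wd;		/* initialised in ->init() */
	psched_time_t		next_send;
};

static struct sk_buff *sketch_shaper_dequeue(struct Qdisc *sch)
{
	struct sketch_shaper *q = qdisc_priv(sch);

	if (q->next_send <= psched_get_time())
		return __skb_dequeue(&sch->q);

	/* Not due yet: arm the hrtimer.  qdisc_watchdog() will clear
	 * TCQ_F_THROTTLED and reschedule the device when it fires. */
	qdisc_watchdog_schedule(&q->wd, q->next_send);
	return NULL;
}
#endif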

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
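
/*
 * Illustrative sketch (not from this file): the typical life cycle of
 * a Qdisc_class_hash in a classful scheduler, modelled loosely on
 * sch_htb.  sketch_classful and sketch_class are invented names.
 */
#if 0
struct sketch_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
};

struct sketch_classful {
	struct Qdisc_class_hash clhash;
};

static int sketch_classful_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sketch_classful *q = qdisc_priv(sch);

	return qdisc_class_hash_init(&q->clhash);	/* starts at 4 buckets */
}

static void sketch_add_class(struct Qdisc *sch, struct sketch_class *cl)
{
	struct sketch_classful *q = qdisc_priv(sch);

	qdisc_class_hash_insert(&q->clhash, &cl->common);
	/* Doubles the table under sch_tree_lock() once load exceeds 3/4. */
	qdisc_class_hash_grow(sch, &q->clhash);
}
#endif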

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
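
/*
 * Illustrative sketch (not from this file): a qdisc that discards
 * packets internally -- outside its own ->enqueue()/->dequeue() pair,
 * e.g. on a timeout -- must report the shrinkage upward, or classful
 * ancestors such as HTB would keep accounting phantom packets.  The
 * caller is assumed to have already removed the packets from its own
 * queues, so its own q.qlen is correct.
 */
#if 0
static void sketch_drop_expired(struct Qdisc *sch, unsigned int dropped)
{
	sch->qstats.drops += dropped;
	qdisc_tree_decrease_qlen(sch, dropped);	/* fix every ancestor's qlen */
}
#endif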

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replay will call qdisc_lookup_ops()
				 * again, so don't keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * an ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requester wanted to
				 *   say that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
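
/*
 * Illustrative reference (not from this file): the nlmsg_flags the
 * usual userspace verbs are expected to send with RTM_NEWQDISC, and
 * which branch above they select.  The helper and verb strings are
 * invented; the flag constants are the real netlink ones.
 */
#if 0
static u16 sketch_qdisc_request_flags(const char *verb)
{
	if (strcmp(verb, "add") == 0)		/* create, fail if it exists */
		return NLM_F_CREATE | NLM_F_EXCL;
	if (strcmp(verb, "replace") == 0)	/* create, or graft over the old one */
		return NLM_F_CREATE | NLM_F_REPLACE;
	return 0;				/* "change": modify in place only */
}
#endif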

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/


static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - the class is root, which has no parent.
	   parent == X:0	 - the parent is the root class.
	   parent == X:Y	 - the parent is a node in the hierarchy.
	   parent == 0:Y	 - the parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - the class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
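
/*
 * Illustrative sketch of the 32-bit addressing decoded above: a handle
 * is major:minor, where the major half names a qdisc and the minor
 * half a class inside it.  TC_H_MAKE/TC_H_MAJ/TC_H_MIN are the real
 * macros from linux/pkt_sched.h; the numbers are examples only.
 */
#if 0
	u32 qid  = TC_H_MAKE(0x10000, 0);	/* qdisc "1:"  (handle 1:0) */
	u32 clid = TC_H_MAKE(qid, 0x20);	/* class "1:20" (hex minor) */

	BUG_ON(TC_H_MAJ(clid) != qid);		/* the class belongs to 1:  */
	BUG_ON(TC_H_MIN(clid) != 0x20);
#endif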


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol, and asks
   the specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk(KERN_WARNING
			       "rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
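
/*
 * Illustrative sketch (not from this file): how a classful scheduler's
 * ->enqueue() typically uses the entry point above to pick a class, in
 * the style of sch_prio.  sketch_cls_sched and its fields are invented
 * names, and action results (TC_ACT_SHOT etc.) are ignored for
 * brevity.
 */
#if 0
struct sketch_cls_sched {
	struct tcf_proto *filter_list;
	unsigned int	  nbands;
	struct Qdisc	 *queues[16];
};

static int sketch_cls_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sketch_cls_sched *q = qdisc_priv(sch);
	struct tcf_result res;
	unsigned int band = 0;

	/* A negative return means "no filter matched": fall back to band 0. */
	if (tc_classify(skb, q->filter_list, &res) >= 0)
		band = TC_H_MIN(res.classid) % q->nbands;

	return qdisc_enqueue(skb, q->queues[band]);
}
#endif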

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / (u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);