/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the parts of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
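
/*
 * A minimal sketch of the enqueue/dequeue contract described above,
 * in the shape of a trivial FIFO-like qdisc. This is illustrative
 * only: the "example_" names are hypothetical and not part of this
 * file.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
 *			return qdisc_enqueue_tail(skb, sch);	// 0 == success
 *
 *		return qdisc_reshape_fail(skb, sch);	// NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		// A shaping qdisc may return NULL here even while
 *		// q.qlen != 0; for this trivial queue, NULL really
 *		// means empty.
 *		return qdisc_dequeue_head(sch);
 *	}
 */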

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
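
/*
 * A minimal, hypothetical sketch of how a qdisc module uses
 * register_qdisc()/unregister_qdisc(); "example_qdisc_ops" and its
 * callbacks are assumptions, not part of this file:
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *	module_exit(example_module_exit);
 */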

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		/* nla_len(tab) was checked against TC_RTAB_SIZE (1024) above */
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
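
/*
 * A hedged sketch of how a shaping qdisc typically uses the shared
 * rate-table cache above (modelled loosely on sch_tbf; the TCA_EX_RTAB
 * attribute name is hypothetical):
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	// in ->init()/->change(), r points at a struct tc_ratespec
 *	// taken from the qdisc options:
 *	rtab = qdisc_get_rtab(r, tb[TCA_EX_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *
 *	// on the fast path: transmission time of a packet in
 *	// psched ticks, via the precomputed table:
 *	u32 ticks = qdisc_l2t(rtab, qdisc_pkt_len(skb));
 *
 *	// in ->destroy() (and when replacing the rate):
 *	qdisc_put_rtab(rtab);
 */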

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
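
/*
 * A worked example of the size-table lookup above, under assumed
 * parameters: an ATM-style table with cell_log = 6 (64-byte slots),
 * overhead = 0, cell_align = -1, size_log = 0:
 *
 *	skb->len = 1500
 *	pkt_len  = 1500 + 0           = 1500
 *	slot     = (1500 - 1) >> 6    = 23
 *	pkt_len  = stab->data[23]     // e.g. 1696, the on-the-wire size
 *
 * so the qdisc accounts for link-layer framing instead of the IP-level
 * length. The table contents (1696) are purely illustrative.
 */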

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		printk(KERN_WARNING
		       "%s: %s qdisc %X: is non-work-conserving?\n",
		       txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
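
/*
 * A minimal sketch of the intended watchdog usage in a shaping
 * qdisc's dequeue path (the fields on "q" are hypothetical):
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		struct example_sched_data *q = qdisc_priv(sch);
 *		psched_time_t now = psched_get_time();
 *
 *		if (now < q->next_send_time) {
 *			// Nothing may be sent yet: arm the watchdog so
 *			// the hrtimer calls __netif_schedule() for us
 *			// when the gate opens.
 *			qdisc_watchdog_schedule(&q->watchdog,
 *						q->next_send_time);
 *			return NULL;
 *		}
 *		return qdisc_dequeue_head(sch);
 *	}
 */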

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
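
/*
 * A hedged sketch of how a classful qdisc uses the class-hash helpers
 * above (cf. sch_htb/sch_hfsc; "example_class" and "q" are
 * hypothetical):
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;	// classid + hnode
 *		...
 *	};
 *
 *	// in ->init():
 *	err = qdisc_class_hash_init(&q->clhash);
 *
 *	// when creating a class in ->change():
 *	cl->common.classid = classid;
 *	sch_tree_lock(sch);
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	sch_tree_unlock(sch);
 *	qdisc_class_hash_grow(sch, &q->clhash);	// takes the lock itself
 *
 *	// lookup by classid:
 *	struct Qdisc_class_common *c =
 *		qdisc_class_hash_lookup(&q->clhash, classid);
 */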

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
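
/*
 * For orientation: qdisc and class identifiers are 32-bit handles,
 * written "major:minor" in tc syntax. TC_H_MAJ() selects the qdisc,
 * TC_H_MIN() a class within it; e.g. "1:10" is
 * TC_H_MAKE(0x10000, 0x10) == 0x00010010. Handles produced by
 * qdisc_alloc_handle() above come from the kernel-managed range of
 * majors above 0x8000, always with minor 0.
 */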

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
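
/*
 * A hedged sketch of when a qdisc calls qdisc_tree_decrease_qlen():
 * whenever it drops packets outside of the normal dequeue path, the
 * qlen of every ancestor must shrink too. E.g. in a hypothetical
 * ->change() handler that shortens the queue limit:
 *
 *	unsigned int dropped = 0;
 *
 *	while (sch->q.qlen > q->limit) {
 *		struct sk_buff *skb = qdisc_dequeue_head(sch);
 *
 *		kfree_skb(skb);
 *		dropped++;
 *	}
 *	qdisc_tree_decrease_qlen(sch, dropped);
 */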

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replayed request will call
				 * qdisc_lookup_ops() again, so don't
				 * keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(sch->stab);
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are
				 *   set.
				 *
				 *   2. If EXCL is set, the requestor wanted to
				 *   say that qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue, p,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
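
/*
 * For orientation, roughly how common tc(8) commands map onto the
 * handlers above (a sketch; the exact flag combinations depend on the
 * iproute2 version):
 *
 *	tc qdisc add dev eth0 root handle 1: <kind> ...
 *		RTM_NEWQDISC with NLM_F_CREATE|NLM_F_EXCL,
 *		tcm_parent = TC_H_ROOT, tcm_handle = 1:0
 *		=> tc_modify_qdisc() -> qdisc_create() -> qdisc_graft()
 *
 *	tc qdisc change dev eth0 handle 1: <kind> ...
 *		RTM_NEWQDISC without CREATE/REPLACE/EXCL
 *		=> tc_modify_qdisc() -> qdisc_change()
 *
 *	tc qdisc del dev eth0 root
 *		RTM_DELQDISC
 *		=> tc_get_qdisc() -> qdisc_graft(..., new = NULL, old = q)
 */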

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol, and asks
   the specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "%s: packet reclassify loop"
				       " rule prio %u protocol %02x\n",
				       tp->q->ops->id,
				       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
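
/*
 * The four hex words printed above are, in order: nanoseconds per
 * microsecond, nanoseconds per psched tick, a constant 1000000 kept
 * for compatibility with older tc binaries, and the hrtimer clock
 * resolution expressed in Hz. Illustrative output on a typical
 * high-resolution-timer system (the values are an assumption, not
 * normative):
 *
 *	$ cat /proc/net/psched
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * tc derives its internal time-conversion factors from the first two
 * fields.
 */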

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		printk(KERN_ERR "pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);
1759