xref: /linux/net/sched/sch_generic.c (revision 5e8d780d745c1619aba81fe7166c5a4b5cad2b84)
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   Modifications to data participating in scheduling must additionally
   be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

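/* Take both locks in the documented order (qdisc_tree_lock before
 * dev->queue_lock).  A typical caller is a configuration path that
 * rewires dev->qdisc or edits dev->qdisc_list, roughly:
 *
 *	qdisc_lock_tree(dev);
 *	... replace dev->qdisc / edit dev->qdisc_list ...
 *	qdisc_unlock_tree(dev);
 */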
void qdisc_lock_tree(struct net_device *dev)
{
	write_lock_bh(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
	write_unlock_bh(&qdisc_tree_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty.  The device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit.  No need to add additional overhead by
		 * locking again.  These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can do a trylock, as we do here, too; in case
		 * of lock contention it should return NETDEV_TX_LOCKED
		 * and the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error,
				   when hard_start_xmit() recurses.  We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* The device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

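/* Pump the queue: keep calling qdisc_restart() for as long as it
 * transmitted a packet (returned < 0) and the driver has not stopped
 * the queue, then drop the QDISC_RUNNING bit.  Callers are expected to
 * hold dev->queue_lock with BH disabled and to have set the
 * __LINK_STATE_QDISC_RUNNING bit, which is cleared here.
 */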
void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

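/* Transmit watchdog timer.  If the queue has been stopped for longer
 * than dev->watchdog_timeo since the last transmission, complain and
 * ask the driver to recover via its tx_timeout() hook, then re-arm
 * the timer while the device stays up.
 */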
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

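/* Arm the transmit watchdog for drivers that provide a tx_timeout()
 * hook, defaulting watchdog_timeo to 5 seconds.  A reference on the
 * device is held while the timer is pending and dropped again in
 * dev_watchdog() / dev_watchdog_down().
 */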
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	__netdev_watchdog_up(dev);
	netif_tx_unlock_bh(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

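/* Carrier state helpers: flip __LINK_STATE_NOCARRIER, let the link
 * watch infrastructure report the change, and (on carrier-on) restart
 * the transmit watchdog for a running device.
 */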
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};

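/* Map the low bits of skb->priority (the TC_PRIO_* value) to one of
 * the three pfifo_fast bands; band 0 is dequeued first, band 2 last.
 */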
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

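/* Serve the bands in strict priority order: take the head of the
 * first non-empty band, starting from band 0.
 */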
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

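/* Allocate a zeroed Qdisc together with ops->priv_size bytes of private
 * data, with both aligned to QDISC_ALIGNTO (32 bytes); sch->padded
 * remembers the alignment offset so the original allocation can be
 * freed later.  Returns an ERR_PTR() value on failure.
 */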
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	memset(p, 0, size);
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

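/* Allocate a qdisc for the given ops and run its init() callback with
 * no arguments; returns NULL (not an ERR_PTR) if either step fails.
 */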
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	struct Qdisc_ops  *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	write_lock(&qdisc_tree_lock);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	write_unlock(&qdisc_tree_lock);
	module_put(ops->owner);

	dev_put(qdisc->dev);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct list_head cql = LIST_HEAD_INIT(cql);
	struct Qdisc *cq, *q, *n;

	if (qdisc->flags & TCQ_F_BUILTIN ||
		!atomic_dec_and_test(&qdisc->refcnt))
		return;

	if (!list_empty(&qdisc->list)) {
		if (qdisc->ops->cl_ops == NULL)
			list_del(&qdisc->list);
		else
			list_move(&qdisc->list, &cql);
	}

	/* unlink inner qdiscs from dev->qdisc_list immediately */
	list_for_each_entry(cq, &cql, list)
		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
				if (q->ops->cl_ops == NULL)
					list_del_init(&q->list);
				else
					list_move_tail(&q->list, &cql);
			}
	list_for_each_entry_safe(cq, n, &cql, list)
		list_del_init(&cq->list);

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device, create
	   a default one, i.e. pfifo_fast for devices which need queueing
	   and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock_bh(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock_bh(&qdisc_tree_lock);
		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock_bh(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock_bh(&qdisc_tree_lock);
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

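/* Detach the active qdisc from the device: point dev->qdisc at
 * noop_qdisc, reset the old qdisc, stop the watchdog, and wait for
 * any in-flight dev_queue_xmit()/qdisc_run() users to finish before
 * discarding a leftover GSO skb.
 */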
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

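/* Give a new device its initial scheduler state: the built-in noop
 * qdisc, an empty qdisc_list and an initialized (but not yet armed)
 * watchdog timer.
 */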
void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

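/* Final teardown: destroy the sleeping qdisc (and the ingress qdisc,
 * if one is attached), leaving both qdisc pointers at noop_qdisc; the
 * watchdog timer must no longer be pending at this point.
 */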
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);