xref: /linux/net/sched/sch_generic.c (revision eb2bce7f5e7ac1ca6da434461217fadf3c688d2c)
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

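/* qdisc_lock_tree()/qdisc_unlock_tree() take both locks in a fixed order:
 * dev->queue_lock (BH-safe, outer) first, then dev->ingress_lock (inner).
 * They are intended for callers that modify the qdisc tree, so that both
 * the egress and ingress paths are excluded while the tree changes.
 */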
void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
	    >0  - queue is not empty, but throttled.
	    <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can use a trylock, as we do here, too: on
		 * lock contention it should return NETDEV_TX_LOCKED and
		 * the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* The device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

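/* Transmit watchdog: dev_watchdog() runs from dev->watchdog_timer. If the
 * queue has been stopped for longer than dev->watchdog_timeo while the
 * device is present, running and has carrier, it reports a transmit timeout,
 * calls the driver's tx_timeout() hook and re-arms the timer. A device
 * reference is held for as long as the timer is pending.
 */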
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

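/* Arm the watchdog only if the driver implements tx_timeout(). A driver
 * that leaves watchdog_timeo unset gets a default of 5 seconds. mod_timer()
 * returns 0 when the timer was not already pending; in that case a device
 * reference is taken to match the dev_put() in dev_watchdog().
 */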
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

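/* Carrier state helpers: netif_carrier_on()/netif_carrier_off() toggle
 * __LINK_STATE_NOCARRIER and fire a linkwatch event only on an actual state
 * change; carrier-on also re-arms the transmit watchdog on a running device.
 */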
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

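/* noop_qdisc is the built-in placeholder installed as dev->qdisc while a
 * device is deactivated or not yet set up (see dev_deactivate() and
 * dev_init_scheduler() below); anything enqueued to it is simply dropped.
 */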
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

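/* noqueue_qdisc is used for virtual devices with tx_queue_len == 0. Its
 * .enqueue is deliberately NULL: the transmit path only goes through a
 * qdisc when the enqueue hook is set, so packets on a noqueue device are
 * handed straight to the driver instead of being queued.
 */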
static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

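/* Example of the prio2band mapping above, assuming the TC_PRIO_* values
 * from <linux/pkt_sched.h>: TC_PRIO_CONTROL (7) and TC_PRIO_INTERACTIVE (6)
 * land in band 0 (served first), TC_PRIO_BESTEFFORT (0) in band 1, and
 * TC_PRIO_BULK (2) in band 2 (served last). The band is selected purely
 * from skb->priority & TC_PRIO_MAX.
 */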
static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

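/* qdisc_alloc() lays out a single allocation as [padding][struct Qdisc]
 * [ops->priv_size bytes of private data], with the Qdisc (and therefore the
 * area returned by qdisc_priv()) aligned to QDISC_ALIGNTO. sch->padded
 * records how far the Qdisc was shifted into the allocation so that
 * __qdisc_destroy() can kfree() the original pointer.
 */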
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
}

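/* qdisc_create_dflt() allocates a qdisc with the given ops, points its
 * stats_lock at dev->queue_lock, records the parent handle and runs the
 * ops->init() hook (with a NULL option block). On any failure it returns
 * NULL rather than an error pointer.
 */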
struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->stats_lock = &dev->queue_lock;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* this is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

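/* qdisc_destroy() drops the caller's reference. The final kfree() is
 * deferred via call_rcu() because the transmit path may still be looking at
 * an old dev->qdisc pointer: it is published with rcu_assign_pointer() in
 * dev_activate(), and dev_deactivate() below waits with synchronize_rcu().
 */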
/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   which need queueing and noqueue_qdisc for virtual
	   interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

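/* dev_deactivate() swaps noop_qdisc in as the active qdisc (so new packets
 * are dropped), resets the old qdisc, stops the watchdog, and then waits:
 * synchronize_rcu() for transmitters still using the old qdisc pointer, and
 * a yield() loop until any running __qdisc_run() instance has cleared
 * __LINK_STATE_QDISC_RUNNING. Any deferred GSO skb is freed as well.
 */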
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

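/* dev_shutdown() tears the scheduling state back down to noop_qdisc under
 * the tree locks, destroying the sleeping root qdisc and, if configured,
 * the ingress qdisc. By this point the watchdog timer must no longer be
 * pending (hence the BUG_TRAP below).
 */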
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);