xref: /linux/net/sched/sch_generic.c (revision f8343685643f2901fe11aa9d0358cafbeaf7b4c3)
1 /*
2  * net/sched/sch_generic.c	Generic packet scheduler routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13 
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36 
37 /* Main transmission queue. */
38 
39 /* Modifications to data participating in scheduling must be protected with
40  * dev->queue_lock spinlock.
41  *
42  * The idea is the following:
43  * - enqueue, dequeue are serialized via top level device
44  *   spinlock dev->queue_lock.
45  * - ingress filtering is serialized via top level device
46  *   spinlock dev->ingress_lock.
47  * - updates to tree and tree walking are only done under the rtnl mutex.
48  */
49 
50 void qdisc_lock_tree(struct net_device *dev)
51 {
52 	spin_lock_bh(&dev->queue_lock);
53 	spin_lock(&dev->ingress_lock);
54 }
55 
56 void qdisc_unlock_tree(struct net_device *dev)
57 {
58 	spin_unlock(&dev->ingress_lock);
59 	spin_unlock_bh(&dev->queue_lock);
60 }
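/*
 * Editor's note: a minimal usage sketch of the two helpers above, mirroring
 * what dev_init_scheduler() and dev_shutdown() near the end of this file do.
 * "new_qdisc" is hypothetical and only for illustration.
 */
#if 0
	qdisc_lock_tree(dev);		/* queue_lock (BH off) + ingress_lock  */
	dev->qdisc = new_qdisc;		/* dev->qdisc is guarded by queue_lock */
	dev->qdisc_sleeping = new_qdisc;
	qdisc_unlock_tree(dev);		/* released in reverse order */
#endif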
61 
62 /*
63    dev->queue_lock serializes queue accesses for this device
64    AND the dev->qdisc pointer itself.
65 
66    netif_tx_lock serializes accesses to the device driver.
67 
68    dev->queue_lock and netif_tx_lock are mutually exclusive:
69    if one is grabbed, the other must be free.
70  */
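/*
 * Editor's illustration (not from the original file): the lock hand-off that
 * qdisc_restart() below performs, with error handling stripped.  The driver
 * lock is only taken when the device does not advertise NETIF_F_LLTX.
 */
#if 0
	/* entered with dev->queue_lock held and BH disabled */
	skb = q->dequeue(q);			/* queue access under queue_lock  */
	netif_tx_trylock(dev);			/* driver lock, unless LLTX       */
	spin_unlock(&dev->queue_lock);		/* dropped before the driver runs */
	dev_hard_start_xmit(skb, dev);		/* only netif_tx_lock held here   */
	netif_tx_unlock(dev);
	spin_lock(&dev->queue_lock);		/* reacquired to check q->q.qlen  */
#endif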
71 
72 
73 /* Kick device.
74 
75    Returns:  0  - queue is empty or throttled.
76 	    >0  - queue is not empty.
77 
78    NOTE: Called under dev->queue_lock with locally disabled BH.
79 */
80 
81 static inline int qdisc_restart(struct net_device *dev)
82 {
83 	struct Qdisc *q = dev->qdisc;
84 	struct sk_buff *skb;
85 
86 	/* Dequeue packet */
87 	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
88 		unsigned nolock = (dev->features & NETIF_F_LLTX);
89 
90 		dev->gso_skb = NULL;
91 
92 		/*
93 		 * When the driver has LLTX set it does its own locking
94 		 * in start_xmit. There is no need to add the overhead of
95 		 * locking again. These checks are worth it because
96 		 * even uncontended locks can be quite expensive.
97 		 * The driver can use a trylock, as we do here; if the
98 		 * lock is contended it should return NETDEV_TX_LOCKED
99 		 * and the packet will be requeued.
100 		 */
101 		if (!nolock) {
102 			if (!netif_tx_trylock(dev)) {
103 			collision:
104 				/* So, someone grabbed the driver. */
105 
106 				/* It may be a transient configuration error,
107 				   when hard_start_xmit() recurses. We detect
108 				   this by checking the xmit lock owner and
109 				   drop the packet when a dead loop is detected.
110 				*/
111 				if (dev->xmit_lock_owner == smp_processor_id()) {
112 					kfree_skb(skb);
113 					if (net_ratelimit())
114 						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
115 					goto out;
116 				}
117 				__get_cpu_var(netdev_rx_stat).cpu_collision++;
118 				goto requeue;
119 			}
120 		}
121 
122 		{
123 			/* And release queue */
124 			spin_unlock(&dev->queue_lock);
125 
126 			if (!netif_queue_stopped(dev)) {
127 				int ret;
128 
129 				ret = dev_hard_start_xmit(skb, dev);
130 				if (ret == NETDEV_TX_OK) {
131 					if (!nolock) {
132 						netif_tx_unlock(dev);
133 					}
134 					spin_lock(&dev->queue_lock);
135 					q = dev->qdisc;
136 					goto out;
137 				}
138 				if (ret == NETDEV_TX_LOCKED && nolock) {
139 					spin_lock(&dev->queue_lock);
140 					q = dev->qdisc;
141 					goto collision;
142 				}
143 			}
144 
145 			/* NETDEV_TX_BUSY - we need to requeue */
146 			/* Release the driver */
147 			if (!nolock) {
148 				netif_tx_unlock(dev);
149 			}
150 			spin_lock(&dev->queue_lock);
151 			q = dev->qdisc;
152 		}
153 
154 		/* Device kicked us out :(
155 		   This is possible in the following cases:
156 
157 		   0. driver is locked
158 		   1. fastroute is enabled
159 		   2. device cannot determine busy state
160 		      before start of transmission (e.g. dialout)
161 		   3. device is buggy (ppp)
162 		 */
163 
164 requeue:
165 		if (unlikely(q == &noop_qdisc))
166 			kfree_skb(skb);
167 		else if (skb->next)
168 			dev->gso_skb = skb;
169 		else
170 			q->ops->requeue(skb, q);
171 		netif_schedule(dev);
172 	}
173 	return 0;
174 
175 out:
176 	BUG_ON((int) q->q.qlen < 0);
177 	return q->q.qlen;
178 }
179 
180 void __qdisc_run(struct net_device *dev)
181 {
182 	do {
183 		if (!qdisc_restart(dev))
184 			break;
185 	} while (!netif_queue_stopped(dev));
186 
187 	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
188 }
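/*
 * Editor's note: __qdisc_run() is reached through qdisc_run(), defined in
 * include/net/pkt_sched.h in kernels of this vintage.  The sketch below is
 * from memory and may differ in detail; it shows why the loop above has to
 * clear __LINK_STATE_QDISC_RUNNING when it finishes.
 */
#if 0
static inline void qdisc_run(struct net_device *dev)
{
	if (!netif_queue_stopped(dev) &&
	    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		__qdisc_run(dev);	/* only one CPU runs a device's qdisc */
}
#endif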
189 
190 static void dev_watchdog(unsigned long arg)
191 {
192 	struct net_device *dev = (struct net_device *)arg;
193 
194 	netif_tx_lock(dev);
195 	if (dev->qdisc != &noop_qdisc) {
196 		if (netif_device_present(dev) &&
197 		    netif_running(dev) &&
198 		    netif_carrier_ok(dev)) {
199 			if (netif_queue_stopped(dev) &&
200 			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
201 
202 				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
203 				       dev->name);
204 				dev->tx_timeout(dev);
205 			}
206 			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
207 				dev_hold(dev);
208 		}
209 	}
210 	netif_tx_unlock(dev);
211 
212 	dev_put(dev);
213 }
214 
215 static void dev_watchdog_init(struct net_device *dev)
216 {
217 	init_timer(&dev->watchdog_timer);
218 	dev->watchdog_timer.data = (unsigned long)dev;
219 	dev->watchdog_timer.function = dev_watchdog;
220 }
221 
222 void __netdev_watchdog_up(struct net_device *dev)
223 {
224 	if (dev->tx_timeout) {
225 		if (dev->watchdog_timeo <= 0)
226 			dev->watchdog_timeo = 5*HZ;
227 		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
228 			dev_hold(dev);
229 	}
230 }
231 
232 static void dev_watchdog_up(struct net_device *dev)
233 {
234 	__netdev_watchdog_up(dev);
235 }
236 
237 static void dev_watchdog_down(struct net_device *dev)
238 {
239 	netif_tx_lock_bh(dev);
240 	if (del_timer(&dev->watchdog_timer))
241 		dev_put(dev);
242 	netif_tx_unlock_bh(dev);
243 }
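/*
 * Editor's note: a hypothetical driver arms the watchdog machinery above by
 * filling in two fields before register_netdev(); the callback name below is
 * made up for illustration.
 */
#if 0
	dev->tx_timeout     = mydrv_tx_timeout;	/* called by dev_watchdog()            */
	dev->watchdog_timeo = 5 * HZ;		/* <= 0 also falls back to 5*HZ above  */
#endif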
244 
245 void netif_carrier_on(struct net_device *dev)
246 {
247 	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
248 		linkwatch_fire_event(dev);
249 	if (netif_running(dev))
250 		__netdev_watchdog_up(dev);
251 }
252 
253 void netif_carrier_off(struct net_device *dev)
254 {
255 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
256 		linkwatch_fire_event(dev);
257 }
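/*
 * Editor's note: drivers typically pair the two helpers above with their link
 * state handling.  A rough, hypothetical example:
 */
#if 0
	netif_carrier_off(dev);			/* at probe, before the PHY reports link    */
	/* ... later, in the link-change handler ... */
	if (link_is_up)
		netif_carrier_on(dev);		/* also re-arms the TX watchdog if running  */
	else
		netif_carrier_off(dev);
#endif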
258 
259 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
260    under all circumstances. It is difficult to invent anything faster or
261    cheaper.
262  */
263 
264 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
265 {
266 	kfree_skb(skb);
267 	return NET_XMIT_CN;
268 }
269 
270 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
271 {
272 	return NULL;
273 }
274 
275 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
276 {
277 	if (net_ratelimit())
278 		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
279 		       skb->dev->name);
280 	kfree_skb(skb);
281 	return NET_XMIT_CN;
282 }
283 
284 struct Qdisc_ops noop_qdisc_ops = {
285 	.id		=	"noop",
286 	.priv_size	=	0,
287 	.enqueue	=	noop_enqueue,
288 	.dequeue	=	noop_dequeue,
289 	.requeue	=	noop_requeue,
290 	.owner		=	THIS_MODULE,
291 };
292 
293 struct Qdisc noop_qdisc = {
294 	.enqueue	=	noop_enqueue,
295 	.dequeue	=	noop_dequeue,
296 	.flags		=	TCQ_F_BUILTIN,
297 	.ops		=	&noop_qdisc_ops,
298 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
299 };
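/*
 * Editor's note: noop_qdisc is what a device is "parked" on while it is down
 * or being torn down (see dev_deactivate(), dev_init_scheduler() and
 * dev_shutdown() below); anything submitted in that state is freed and
 * reported as NET_XMIT_CN by noop_enqueue() above.
 */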
300 
301 static struct Qdisc_ops noqueue_qdisc_ops = {
302 	.id		=	"noqueue",
303 	.priv_size	=	0,
304 	.enqueue	=	noop_enqueue,
305 	.dequeue	=	noop_dequeue,
306 	.requeue	=	noop_requeue,
307 	.owner		=	THIS_MODULE,
308 };
309 
310 static struct Qdisc noqueue_qdisc = {
311 	.enqueue	=	NULL,
312 	.dequeue	=	noop_dequeue,
313 	.flags		=	TCQ_F_BUILTIN,
314 	.ops		=	&noqueue_qdisc_ops,
315 	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
316 };
317 
318 
319 static const u8 prio2band[TC_PRIO_MAX+1] =
320 	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
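/*
 * Editor's worked example: the table is indexed by skb->priority & TC_PRIO_MAX
 * (see prio2list() below), so TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7)
 * map to band 0 (served first), TC_PRIO_BESTEFFORT (0) to band 1, and
 * TC_PRIO_BULK (2) to band 2 (served last).
 */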
321 
322 /* 3-band FIFO queue: old style, but should be a bit faster than
323    generic prio+fifo combination.
324  */
325 
326 #define PFIFO_FAST_BANDS 3
327 
328 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
329 					     struct Qdisc *qdisc)
330 {
331 	struct sk_buff_head *list = qdisc_priv(qdisc);
332 	return list + prio2band[skb->priority & TC_PRIO_MAX];
333 }
334 
335 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
336 {
337 	struct sk_buff_head *list = prio2list(skb, qdisc);
338 
339 	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
340 		qdisc->q.qlen++;
341 		return __qdisc_enqueue_tail(skb, qdisc, list);
342 	}
343 
344 	return qdisc_drop(skb, qdisc);
345 }
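/*
 * Editor's note: the length check above is applied per band, so with the
 * usual Ethernet default of tx_queue_len = 1000 a device can buffer up to
 * 3 * 1000 packets in pfifo_fast before qdisc_drop() starts discarding;
 * there is one limit per band, not one shared limit.
 */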
346 
347 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
348 {
349 	int prio;
350 	struct sk_buff_head *list = qdisc_priv(qdisc);
351 
352 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
353 		if (!skb_queue_empty(list + prio)) {
354 			qdisc->q.qlen--;
355 			return __qdisc_dequeue_head(qdisc, list + prio);
356 		}
357 	}
358 
359 	return NULL;
360 }
361 
362 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
363 {
364 	qdisc->q.qlen++;
365 	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
366 }
367 
368 static void pfifo_fast_reset(struct Qdisc* qdisc)
369 {
370 	int prio;
371 	struct sk_buff_head *list = qdisc_priv(qdisc);
372 
373 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
374 		__qdisc_reset_queue(qdisc, list + prio);
375 
376 	qdisc->qstats.backlog = 0;
377 	qdisc->q.qlen = 0;
378 }
379 
380 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
381 {
382 	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
383 
384 	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
385 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
386 	return skb->len;
387 
388 rtattr_failure:
389 	return -1;
390 }
391 
392 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
393 {
394 	int prio;
395 	struct sk_buff_head *list = qdisc_priv(qdisc);
396 
397 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
398 		skb_queue_head_init(list + prio);
399 
400 	return 0;
401 }
402 
403 static struct Qdisc_ops pfifo_fast_ops = {
404 	.id		=	"pfifo_fast",
405 	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
406 	.enqueue	=	pfifo_fast_enqueue,
407 	.dequeue	=	pfifo_fast_dequeue,
408 	.requeue	=	pfifo_fast_requeue,
409 	.init		=	pfifo_fast_init,
410 	.reset		=	pfifo_fast_reset,
411 	.dump		=	pfifo_fast_dump,
412 	.owner		=	THIS_MODULE,
413 };
414 
415 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
416 {
417 	void *p;
418 	struct Qdisc *sch;
419 	unsigned int size;
420 	int err = -ENOBUFS;
421 
422 	/* ensure that the Qdisc and the private data are 32-byte aligned */
423 	size = QDISC_ALIGN(sizeof(*sch));
424 	size += ops->priv_size + (QDISC_ALIGNTO - 1);
425 
426 	p = kzalloc(size, GFP_KERNEL);
427 	if (!p)
428 		goto errout;
429 	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
430 	sch->padded = (char *) sch - (char *) p;
431 
432 	INIT_LIST_HEAD(&sch->list);
433 	skb_queue_head_init(&sch->q);
434 	sch->ops = ops;
435 	sch->enqueue = ops->enqueue;
436 	sch->dequeue = ops->dequeue;
437 	sch->dev = dev;
438 	dev_hold(dev);
439 	atomic_set(&sch->refcnt, 1);
440 
441 	return sch;
442 errout:
443 	return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
444 }
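/*
 * Editor's worked example for the alignment logic above, assuming the usual
 * QDISC_ALIGNTO of 32 from net/pkt_sched.h: if kzalloc() returns an address
 * ending in 0xe8, QDISC_ALIGN() rounds sch up by 24 bytes to the next 32-byte
 * boundary, and sch->padded records those 24 bytes so that __qdisc_destroy()
 * can later kfree((char *) qdisc - qdisc->padded).
 */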
445 
446 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
447 				 unsigned int parentid)
448 {
449 	struct Qdisc *sch;
450 
451 	sch = qdisc_alloc(dev, ops);
452 	if (IS_ERR(sch))
453 		goto errout;
454 	sch->stats_lock = &dev->queue_lock;
455 	sch->parent = parentid;
456 
457 	if (!ops->init || ops->init(sch, NULL) == 0)
458 		return sch;
459 
460 	qdisc_destroy(sch);
461 errout:
462 	return NULL;
463 }
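/*
 * Editor's note: a hedged sketch of how this helper is typically used.
 * dev_activate() below calls it for a device's root qdisc; classful qdiscs
 * such as sch_prio use the same call to create per-class FIFO children.  The
 * handle arithmetic here is illustrative rather than copied from those files.
 */
#if 0
	struct Qdisc *child;

	child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
				  TC_H_MAKE(sch->handle, 1));
	if (child == NULL)
		child = &noop_qdisc;	/* fall back, as callers usually do */
#endif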
464 
465 /* Under dev->queue_lock and BH! */
466 
467 void qdisc_reset(struct Qdisc *qdisc)
468 {
469 	struct Qdisc_ops *ops = qdisc->ops;
470 
471 	if (ops->reset)
472 		ops->reset(qdisc);
473 }
474 
475 /* This is the RCU callback used to clean up a qdisc once there
476  * are no further references to it. */
477 
478 static void __qdisc_destroy(struct rcu_head *head)
479 {
480 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
481 	kfree((char *) qdisc - qdisc->padded);
482 }
483 
484 /* Under dev->queue_lock and BH! */
485 
486 void qdisc_destroy(struct Qdisc *qdisc)
487 {
488 	struct Qdisc_ops  *ops = qdisc->ops;
489 
490 	if (qdisc->flags & TCQ_F_BUILTIN ||
491 	    !atomic_dec_and_test(&qdisc->refcnt))
492 		return;
493 
494 	list_del(&qdisc->list);
495 #ifdef CONFIG_NET_ESTIMATOR
496 	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
497 #endif
498 	if (ops->reset)
499 		ops->reset(qdisc);
500 	if (ops->destroy)
501 		ops->destroy(qdisc);
502 
503 	module_put(ops->owner);
504 	dev_put(qdisc->dev);
505 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
506 }
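/*
 * Editor's note: the TCQ_F_BUILTIN test above is what keeps the statically
 * allocated noop_qdisc and noqueue_qdisc alive forever; calling qdisc_destroy()
 * on them is a no-op, which the teardown paths below rely on.
 */
#if 0
	qdisc_destroy(&noop_qdisc);	/* returns immediately: TCQ_F_BUILTIN is set */
#endif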
507 
508 void dev_activate(struct net_device *dev)
509 {
510 	/* No queueing discipline is attached to the device;
511 	   create a default one, i.e. pfifo_fast for devices
512 	   which need queueing, and noqueue_qdisc for
513 	   virtual interfaces.
514 	 */
515 
516 	if (dev->qdisc_sleeping == &noop_qdisc) {
517 		struct Qdisc *qdisc;
518 		if (dev->tx_queue_len) {
519 			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
520 						  TC_H_ROOT);
521 			if (qdisc == NULL) {
522 				printk(KERN_INFO "%s: activation failed\n", dev->name);
523 				return;
524 			}
525 			list_add_tail(&qdisc->list, &dev->qdisc_list);
526 		} else {
527 			qdisc =  &noqueue_qdisc;
528 		}
529 		dev->qdisc_sleeping = qdisc;
530 	}
531 
532 	if (!netif_carrier_ok(dev))
533 		/* Delay activation until next carrier-on event */
534 		return;
535 
536 	spin_lock_bh(&dev->queue_lock);
537 	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
538 	if (dev->qdisc != &noqueue_qdisc) {
539 		dev->trans_start = jiffies;
540 		dev_watchdog_up(dev);
541 	}
542 	spin_unlock_bh(&dev->queue_lock);
543 }
544 
545 void dev_deactivate(struct net_device *dev)
546 {
547 	struct Qdisc *qdisc;
548 	struct sk_buff *skb;
549 
550 	spin_lock_bh(&dev->queue_lock);
551 	qdisc = dev->qdisc;
552 	dev->qdisc = &noop_qdisc;
553 
554 	qdisc_reset(qdisc);
555 
556 	skb = dev->gso_skb;
557 	dev->gso_skb = NULL;
558 	spin_unlock_bh(&dev->queue_lock);
559 
560 	kfree_skb(skb);
561 
562 	dev_watchdog_down(dev);
563 
564 	/* Wait for outstanding dev_queue_xmit calls. */
565 	synchronize_rcu();
566 
567 	/* Wait for outstanding qdisc_run calls. */
568 	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
569 		yield();
570 }
571 
572 void dev_init_scheduler(struct net_device *dev)
573 {
574 	qdisc_lock_tree(dev);
575 	dev->qdisc = &noop_qdisc;
576 	dev->qdisc_sleeping = &noop_qdisc;
577 	INIT_LIST_HEAD(&dev->qdisc_list);
578 	qdisc_unlock_tree(dev);
579 
580 	dev_watchdog_init(dev);
581 }
582 
583 void dev_shutdown(struct net_device *dev)
584 {
585 	struct Qdisc *qdisc;
586 
587 	qdisc_lock_tree(dev);
588 	qdisc = dev->qdisc_sleeping;
589 	dev->qdisc = &noop_qdisc;
590 	dev->qdisc_sleeping = &noop_qdisc;
591 	qdisc_destroy(qdisc);
592 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
593 	if ((qdisc = dev->qdisc_ingress) != NULL) {
594 		dev->qdisc_ingress = NULL;
595 		qdisc_destroy(qdisc);
596 	}
597 #endif
598 	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
599 	qdisc_unlock_tree(dev);
600 }
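/*
 * Editor's summary of how the hooks in this file are wired up by the core in
 * kernels of this era (a sketch, not verbatim from net/core/dev.c):
 *
 *   register_netdevice()   -> dev_init_scheduler()  (noop_qdisc parked)
 *   dev_open()             -> dev_activate()        (real qdisc installed)
 *   dev_close()            -> dev_deactivate()      (back to noop_qdisc)
 *   unregister_netdevice() -> dev_shutdown()        (qdiscs destroyed)
 */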
601 
602 EXPORT_SYMBOL(netif_carrier_on);
603 EXPORT_SYMBOL(netif_carrier_off);
604 EXPORT_SYMBOL(noop_qdisc);
605 EXPORT_SYMBOL(qdisc_create_dflt);
606 EXPORT_SYMBOL(qdisc_destroy);
607 EXPORT_SYMBOL(qdisc_reset);
608 EXPORT_SYMBOL(qdisc_lock_tree);
609 EXPORT_SYMBOL(qdisc_unlock_tree);
610