xref: /linux/net/sched/sch_generic.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
	write_lock_bh(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
	write_unlock_bh(&qdisc_tree_lock);
}
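
/*
 * Typical usage when attaching or detaching a qdisc from process context
 * (a minimal sketch; dev_init_scheduler() and dev_shutdown() below are
 * real callers):
 *
 *	qdisc_lock_tree(dev);
 *	... modify dev->qdisc, dev->qdisc_sleeping or dev->qdisc_list ...
 *	qdisc_unlock_tree(dev);
 */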

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */


/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not assume here that the device is free to transmit.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but nothing was transmitted
                  (the qdisc is throttled or the packet was requeued).
            <0  - a packet was dequeued and taken care of (transmitted
                  or dropped); the queue may contain more packets.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. There is no need to add the overhead of
		 * locking again; these checks are worth it because even
		 * uncontended locks can be quite expensive. The driver
		 * can do a trylock, as we do here: on lock contention it
		 * should return NETDEV_TX_LOCKED (-1) and the packet
		 * will be requeued.
		 */
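		/*
		 * Illustrative LLTX driver sketch (hypothetical, not a
		 * driver in this tree): such a hard_start_xmit() would
		 * take its own lock and report contention back to us:
		 *
		 *	if (!spin_trylock(&priv->tx_lock))
		 *		return NETDEV_TX_LOCKED;  /- requeued below -/
		 *	... hand skb to the hardware ...
		 *	spin_unlock(&priv->tx_lock);
		 *	return NETDEV_TX_OK;
		 */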
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error
				   where hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

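/*
 * __qdisc_run() is entered with __LINK_STATE_QDISC_RUNNING already set
 * by the caller (qdisc_run()), which guarantees that only one CPU runs
 * the queue of a given device at a time; the bit is cleared again on
 * the way out below.
 */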
void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}
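
/*
 * A driver opts in to the transmit watchdog simply by providing a
 * tx_timeout handler before registration (a minimal sketch with
 * hypothetical names; watchdog_timeo falls back to 5*HZ if left at 0):
 *
 *	dev->tx_timeout = foo_tx_timeout;
 *	dev->watchdog_timeo = 2 * HZ;
 *	register_netdev(dev);
 */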

static void dev_watchdog_up(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	__netdev_watchdog_up(dev);
	netif_tx_unlock_bh(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}
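
/*
 * Drivers report link state transitions with the two helpers above,
 * typically from their link interrupt or PHY polling code (sketch,
 * hypothetical condition):
 *
 *	if (link_is_up(priv))
 *		netif_carrier_on(dev);
 *	else
 *		netif_carrier_off(dev);
 */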

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
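
/*
 * Band 0 is dequeued first.  With the map above, TC_PRIO_INTERACTIVE (6)
 * and TC_PRIO_CONTROL (7) go to band 0, TC_PRIO_BESTEFFORT (0) to band 1,
 * and TC_PRIO_BULK (2) to band 2.
 */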

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

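	/* Each band is limited to dev->tx_queue_len packets, so the qdisc
	 * as a whole may hold up to PFIFO_FAST_BANDS times that. */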
	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
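
/*
 * pfifo_fast is the built-in default queueing discipline: dev_activate()
 * below attaches it to any device with a nonzero tx_queue_len.
 */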

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);
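	/*
	 * The extra QDISC_ALIGNTO - 1 bytes guarantee that, wherever
	 * kmalloc() places the buffer, rounding the start up to the next
	 * QDISC_ALIGNTO boundary still leaves room for the private data.
	 */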

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	memset(p, 0, size);
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);	/* err is already negative */
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
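
/*
 * Classful qdiscs typically use qdisc_create_dflt() to attach a default
 * child (a sketch of the usual pattern, not code from this file):
 *
 *	child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
 *	if (child == NULL)
 *		child = &noop_qdisc;
 */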

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* this is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	struct Qdisc_ops  *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	write_lock(&qdisc_tree_lock);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	write_unlock(&qdisc_tree_lock);
	module_put(ops->owner);

	dev_put(qdisc->dev);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct list_head cql = LIST_HEAD_INIT(cql);
	struct Qdisc *cq, *q, *n;

	if (qdisc->flags & TCQ_F_BUILTIN ||
		!atomic_dec_and_test(&qdisc->refcnt))
		return;

	if (!list_empty(&qdisc->list)) {
		if (qdisc->ops->cl_ops == NULL)
			list_del(&qdisc->list);
		else
			list_move(&qdisc->list, &cql);
	}

	/* unlink inner qdiscs from dev->qdisc_list immediately */
	list_for_each_entry(cq, &cql, list)
		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
				if (q->ops->cl_ops == NULL)
					list_del_init(&q->list);
				else
					list_move_tail(&q->list, &cql);
			}
	list_for_each_entry_safe(cq, n, &cql, list)
		list_del_init(&cq->list);

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

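/*
 * Attach dev->qdisc_sleeping as the active qdisc and start the transmit
 * watchdog.  Called from dev_open(); if the carrier is not up yet,
 * activation is delayed until the next carrier-on event (see below).
 */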
void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device, create a
	   default one: pfifo_fast for devices that need queueing, and
	   noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock_bh(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock_bh(&qdisc_tree_lock);
		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock_bh(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock_bh(&qdisc_tree_lock);
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);