/*
 * net/sched/sch_netem.c	Network emulator
 *
 * 		This program is free software; you can redistribute it and/or
 * 		modify it under the terms of the GNU General Public License
 * 		as published by the Free Software Foundation; either version
 * 		2 of the License, or (at your option) any later version.
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, "DummyNet for FreeBSD"

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NIST Net. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can also be loaded from a table, to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines.  It does not need to do bandwidth
	 control either, since that can be handled by using a token
	 bucket or other rate control.

	 The emulator is limited by the Linux timer resolution
	 and will create packet bursts on the HZ boundary (1ms with HZ=1000).
*/
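
/* For illustration, netem is configured from userspace via tc (iproute2).
 * A minimal sketch of typical invocations (example values, not part of
 * this file):
 *
 *	# 100ms delay, +/-10ms jitter, 25% correlation between samples
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *	# 0.1% random packet loss
 *	tc qdisc change dev eth0 root netem loss 0.1%
 *
 * tc translates these into the tc_netem_qopt/tc_netem_corr structures
 * parsed by netem_change() below.
 */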

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct sk_buff_head delayed;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
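
/* A worked example of the blend above (illustrative numbers, not from the
 * source): with state->rho near 0x80000000, the scaled rho is about 2^31,
 * so each output is roughly the average of fresh randomness and the
 * previous output:
 *
 *	answer ~= (value * 2^31 + last * 2^31) >> 32 = (value + last) / 2
 *
 * rho == 0 short-circuits to pure net_random(), and rho near 2^32 - 1
 * makes each output track the previous one closely.
 */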

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
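
/* The return statement above computes mu + (sigma * t) / NETEM_DIST_SCALE
 * without overflowing a long: sigma is split into quotient and remainder
 * modulo NETEM_DIST_SCALE, and the remainder product is rounded to nearest.
 * A worked example, taking NETEM_DIST_SCALE as 8192 for illustration, with
 * sigma = 10000 and t = 4096:
 *
 *	x      = (10000 % 8192) * 4096 + 8192/2 = 7409664
 *	result = 7409664/8192 + (10000/8192) * 4096 + mu
 *	       = 904 + 4096 + mu = mu + 5000   (exactly 10000 * 4096/8192)
 */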

/* Put skb in the private delayed queue. */
static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	psched_tdiff_t td;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);

	/* Always queue at tail to keep packets in order */
	if (likely(q->delayed.qlen < q->limit)) {
		struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;

		PSCHED_TADD2(now, td, cb->time_to_send);

		pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb,
			 now, cb->time_to_send);

		__skb_queue_tail(&q->delayed, skb);
		return NET_XMIT_SUCCESS;
	}

	pr_debug("netem_delay: queue over limit %d\n", q->limit);
	sch->qstats.overlimits++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

/*
 *  Move a packet that is ready to send from the delay holding
 *  list to the underlying qdisc.
 */
static int netem_run(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	psched_time_t now;

	PSCHED_GET_TIME(now);

	skb = skb_peek(&q->delayed);
	if (skb) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		long delay
			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);

		/* is there more time remaining? */
		if (delay > 0) {
			mod_timer(&q->timer, jiffies + delay);
			return 1;
		}

		__skb_unlink(skb, &q->delayed);

		if (q->qdisc->enqueue(skb, q->qdisc)) {
			sch->q.qlen--;
			sch->qstats.drops++;
		}
	}

	return 0;
}

static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
		pr_debug("netem_enqueue: random loss\n");
		sch->qstats.drops++;
		kfree_skb(skb);
		return 0;	/* lie about loss so TCP doesn't know */
	}

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
		struct sk_buff *skb2;

		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) {
			struct Qdisc *qp;

			/* Since one packet can generate two packets in the
			 * queue, the parent's qlen accounting gets confused,
			 * so fix it.
			 */
			qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent));
			if (qp)
				qp->q.qlen++;

			sch->q.qlen++;
			sch->bstats.bytes += skb2->len;
			sch->bstats.packets++;
		} else
			sch->qstats.drops++;
	}

	/* If doing simple delay then gap == 0, so all packets
	 * go into the delayed holding queue.
	 * Otherwise, when doing out-of-order delivery, only "1 out of gap"
	 * packets will be delayed.
	 */
	if (q->counter < q->gap) {
		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
	} else {
		q->counter = 0;
		ret = netem_delay(sch, skb);
		netem_run(sch);
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
	} else
		sch->qstats.drops++;

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}
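
/* A concrete trace of the gap logic above: with gap = 4, the first four
 * packets pass straight to the inner qdisc (counter runs 0 -> 4) and the
 * fifth resets the counter and goes through the delay holding queue.
 * Because that packet is released later than its successors, it arrives
 * out of order.  With gap == 0, counter < gap is never true and every
 * packet is delayed.
 */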

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int pending;

	pending = netem_run(sch);

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		pr_debug("netem_dequeue: return skb=%p\n", skb);
		sch->q.qlen--;
		sch->flags &= ~TCQ_F_THROTTLED;
	} else if (pending) {
		pr_debug("netem_dequeue: throttling\n");
		sch->flags |= TCQ_F_THROTTLED;
	}

	return skb;
}

static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}
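
/* How the throttle handshake works: when the head of the holding queue is
 * not yet due, netem_run() arms the timer and netem_dequeue() returns NULL
 * with TCQ_F_THROTTLED set, so the device stops polling this qdisc.  When
 * the timer fires, netem_watchdog() clears the flag and netif_schedule()
 * reschedules the device, which calls netem_dequeue() again.
 */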

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	skb_queue_purge(&q->delayed);

	sch->q.qlen = 0;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer_sync(&q->timer);
}

static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
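
/* Where the table comes from (background, not defined in this file): tc
 * from iproute2 ships precomputed distribution files such as normal.dist
 * and pareto.dist and passes one down as the TCA_NETEM_DELAY_DIST
 * attribute.  Each entry is a signed sample of the inverse CDF in units
 * of sigma/NETEM_DIST_SCALE, which is what tabledist() expects.
 */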

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	skb_queue_head_init(&q->delayed);
	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};

static int __init netem_module_init(void)
{
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");