xref: /linux/net/sched/sch_netem.c (revision 9e8ba5f3ec35cba4fd8a8bebda548c4db2651e40)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 
27 #include <net/netlink.h>
28 #include <net/pkt_sched.h>
29 
30 #define VERSION "1.3"
31 
32 /*	Network Emulation Queuing algorithm.
33 	====================================
34 
35 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
36 		 Network Emulation Tool
37 		 [2] Luigi Rizzo, DummyNet for FreeBSD
38 
39 	 ----------------------------------------------------------------
40 
41 	 This started out as a simple way to delay outgoing packets to
42 	 test TCP but has grown to include most of the functionality
43 	 of a full blown network emulator like NISTnet. It can delay
44 	 packets and add random jitter (and correlation). The random
45 	 distribution can be loaded from a table as well to provide
46 	 normal, Pareto, or experimental curves. Packet loss,
47 	 duplication, and reordering can also be emulated.
48 
49 	 This qdisc does not do classification that can be handled in
50 	 layering other disciplines.  It does not need to do bandwidth
51 	 control either since that can be handled by using token
52 	 bucket or other rate control.
53 
54      Correlated Loss Generator models
55 
56 	Added generation of correlated loss according to the
57 	"Gilbert-Elliot" model, a 4-state markov model.
58 
59 	References:
60 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
61 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
62 	and intuitive loss model for packet networks and its implementation
63 	in the Netem module in the Linux kernel", available in [1]
64 
65 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
66 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
67 */
68 
69 struct netem_sched_data {
70 	struct Qdisc	*qdisc;
71 	struct qdisc_watchdog watchdog;
72 
73 	psched_tdiff_t latency;
74 	psched_tdiff_t jitter;
75 
76 	u32 loss;
77 	u32 limit;
78 	u32 counter;
79 	u32 gap;
80 	u32 duplicate;
81 	u32 reorder;
82 	u32 corrupt;
83 	u32 rate;
84 	s32 packet_overhead;
85 	u32 cell_size;
86 	u32 cell_size_reciprocal;
87 	s32 cell_overhead;
88 
89 	struct crndstate {
90 		u32 last;
91 		u32 rho;
92 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
93 
94 	struct disttable {
95 		u32  size;
96 		s16 table[0];
97 	} *delay_dist;
98 
99 	enum  {
100 		CLG_RANDOM,
101 		CLG_4_STATES,
102 		CLG_GILB_ELL,
103 	} loss_model;
104 
105 	/* Correlated Loss Generation models */
106 	struct clgstate {
107 		/* state of the Markov chain */
108 		u8 state;
109 
110 		/* 4-states and Gilbert-Elliot models */
111 		u32 a1;	/* p13 for 4-states or p for GE */
112 		u32 a2;	/* p31 for 4-states or r for GE */
113 		u32 a3;	/* p32 for 4-states or h for GE */
114 		u32 a4;	/* p14 for 4-states or 1-k for GE */
115 		u32 a5; /* p23 used only in 4-states */
116 	} clg;
117 
118 };
119 
120 /* Time stamp put into socket buffer control block */
121 struct netem_skb_cb {
122 	psched_time_t	time_to_send;
123 };
124 
125 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
126 {
127 	BUILD_BUG_ON(sizeof(skb->cb) <
128 		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
129 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
130 }
131 
132 /* init_crandom - initialize correlated random number generator
133  * Use entropy source for initial seed.
134  */
135 static void init_crandom(struct crndstate *state, unsigned long rho)
136 {
137 	state->rho = rho;
138 	state->last = net_random();
139 }
140 
141 /* get_crandom - correlated random number generator
142  * Next number depends on last value.
143  * rho is scaled to avoid floating point.
144  */
145 static u32 get_crandom(struct crndstate *state)
146 {
147 	u64 value, rho;
148 	unsigned long answer;
149 
150 	if (state->rho == 0)	/* no correlation */
151 		return net_random();
152 
153 	value = net_random();
154 	rho = (u64)state->rho + 1;
155 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
156 	state->last = answer;
157 	return answer;
158 }
159 
160 /* loss_4state - 4-state model loss generator
161  * Generates losses according to the 4-state Markov chain adopted in
162  * the GI (General and Intuitive) loss model.
163  */
164 static bool loss_4state(struct netem_sched_data *q)
165 {
166 	struct clgstate *clg = &q->clg;
167 	u32 rnd = net_random();
168 
169 	/*
170 	 * Makes a comparison between rnd and the transition
171 	 * probabilities outgoing from the current state, then decides the
172 	 * next state and if the next packet has to be transmitted or lost.
173 	 * The four states correspond to:
174 	 *   1 => successfully transmitted packets within a gap period
175 	 *   4 => isolated losses within a gap period
176 	 *   3 => lost packets within a burst period
177 	 *   2 => successfully transmitted packets within a burst period
178 	 */
179 	switch (clg->state) {
180 	case 1:
181 		if (rnd < clg->a4) {
182 			clg->state = 4;
183 			return true;
184 		} else if (clg->a4 < rnd && rnd < clg->a1) {
185 			clg->state = 3;
186 			return true;
187 		} else if (clg->a1 < rnd)
188 			clg->state = 1;
189 
190 		break;
191 	case 2:
192 		if (rnd < clg->a5) {
193 			clg->state = 3;
194 			return true;
195 		} else
196 			clg->state = 2;
197 
198 		break;
199 	case 3:
200 		if (rnd < clg->a3)
201 			clg->state = 2;
202 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
203 			clg->state = 1;
204 			return true;
205 		} else if (clg->a2 + clg->a3 < rnd) {
206 			clg->state = 3;
207 			return true;
208 		}
209 		break;
210 	case 4:
211 		clg->state = 1;
212 		break;
213 	}
214 
215 	return false;
216 }
217 
218 /* loss_gilb_ell - Gilbert-Elliot model loss generator
219  * Generates losses according to the Gilbert-Elliot loss model or
220  * its special cases  (Gilbert or Simple Gilbert)
221  *
222  * Makes a comparison between random number and the transition
223  * probabilities outgoing from the current state, then decides the
224  * next state. A second random number is extracted and the comparison
225  * with the loss probability of the current state decides if the next
226  * packet will be transmitted or lost.
227  */
228 static bool loss_gilb_ell(struct netem_sched_data *q)
229 {
230 	struct clgstate *clg = &q->clg;
231 
232 	switch (clg->state) {
233 	case 1:
234 		if (net_random() < clg->a1)
235 			clg->state = 2;
236 		if (net_random() < clg->a4)
237 			return true;
238 	case 2:
239 		if (net_random() < clg->a2)
240 			clg->state = 1;
241 		if (clg->a3 > net_random())
242 			return true;
243 	}
244 
245 	return false;
246 }
247 
248 static bool loss_event(struct netem_sched_data *q)
249 {
250 	switch (q->loss_model) {
251 	case CLG_RANDOM:
252 		/* Random packet drop 0 => none, ~0 => all */
253 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
254 
255 	case CLG_4_STATES:
256 		/* 4state loss model algorithm (used also for GI model)
257 		* Extracts a value from the markov 4 state loss generator,
258 		* if it is 1 drops a packet and if needed writes the event in
259 		* the kernel logs
260 		*/
261 		return loss_4state(q);
262 
263 	case CLG_GILB_ELL:
264 		/* Gilbert-Elliot loss model algorithm
265 		* Extracts a value from the Gilbert-Elliot loss generator,
266 		* if it is 1 drops a packet and if needed writes the event in
267 		* the kernel logs
268 		*/
269 		return loss_gilb_ell(q);
270 	}
271 
272 	return false;	/* not reached */
273 }
274 
275 
276 /* tabledist - return a pseudo-randomly distributed value with mean mu and
277  * std deviation sigma.  Uses table lookup to approximate the desired
278  * distribution, and a uniformly-distributed pseudo-random source.
279  */
280 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
281 				struct crndstate *state,
282 				const struct disttable *dist)
283 {
284 	psched_tdiff_t x;
285 	long t;
286 	u32 rnd;
287 
288 	if (sigma == 0)
289 		return mu;
290 
291 	rnd = get_crandom(state);
292 
293 	/* default uniform distribution */
294 	if (dist == NULL)
295 		return (rnd % (2*sigma)) - sigma + mu;
296 
297 	t = dist->table[rnd % dist->size];
298 	x = (sigma % NETEM_DIST_SCALE) * t;
299 	if (x >= 0)
300 		x += NETEM_DIST_SCALE/2;
301 	else
302 		x -= NETEM_DIST_SCALE/2;
303 
304 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
305 }
306 
307 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
308 {
309 	u64 ticks;
310 
311 	len += q->packet_overhead;
312 
313 	if (q->cell_size) {
314 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
315 
316 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
317 			cells++;
318 		len = cells * (q->cell_size + q->cell_overhead);
319 	}
320 
321 	ticks = (u64)len * NSEC_PER_SEC;
322 
323 	do_div(ticks, q->rate);
324 	return PSCHED_NS2TICKS(ticks);
325 }
326 
327 /*
328  * Insert one skb into qdisc.
329  * Note: parent depends on return value to account for queue length.
330  * 	NET_XMIT_DROP: queue length didn't change.
331  *      NET_XMIT_SUCCESS: one skb was queued.
332  */
333 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
334 {
335 	struct netem_sched_data *q = qdisc_priv(sch);
336 	/* We don't fill cb now as skb_unshare() may invalidate it */
337 	struct netem_skb_cb *cb;
338 	struct sk_buff *skb2;
339 	int ret;
340 	int count = 1;
341 
342 	/* Random duplication */
343 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
344 		++count;
345 
346 	/* Drop packet? */
347 	if (loss_event(q))
348 		--count;
349 
350 	if (count == 0) {
351 		sch->qstats.drops++;
352 		kfree_skb(skb);
353 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
354 	}
355 
356 	skb_orphan(skb);
357 
358 	/*
359 	 * If we need to duplicate packet, then re-insert at top of the
360 	 * qdisc tree, since parent queuer expects that only one
361 	 * skb will be queued.
362 	 */
363 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
364 		struct Qdisc *rootq = qdisc_root(sch);
365 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
366 		q->duplicate = 0;
367 
368 		qdisc_enqueue_root(skb2, rootq);
369 		q->duplicate = dupsave;
370 	}
371 
372 	/*
373 	 * Randomized packet corruption.
374 	 * Make copy if needed since we are modifying
375 	 * If packet is going to be hardware checksummed, then
376 	 * do it now in software before we mangle it.
377 	 */
378 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
379 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
380 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
381 		     skb_checksum_help(skb))) {
382 			sch->qstats.drops++;
383 			return NET_XMIT_DROP;
384 		}
385 
386 		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
387 	}
388 
389 	cb = netem_skb_cb(skb);
390 	if (q->gap == 0 ||		/* not doing reordering */
391 	    q->counter < q->gap ||	/* inside last reordering gap */
392 	    q->reorder < get_crandom(&q->reorder_cor)) {
393 		psched_time_t now;
394 		psched_tdiff_t delay;
395 
396 		delay = tabledist(q->latency, q->jitter,
397 				  &q->delay_cor, q->delay_dist);
398 
399 		now = psched_get_time();
400 
401 		if (q->rate) {
402 			struct sk_buff_head *list = &q->qdisc->q;
403 
404 			delay += packet_len_2_sched_time(skb->len, q);
405 
406 			if (!skb_queue_empty(list)) {
407 				/*
408 				 * Last packet in queue is reference point (now).
409 				 * First packet in queue is already in flight,
410 				 * calculate this time bonus and substract
411 				 * from delay.
412 				 */
413 				delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
414 				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
415 			}
416 		}
417 
418 		cb->time_to_send = now + delay;
419 		++q->counter;
420 		ret = qdisc_enqueue(skb, q->qdisc);
421 	} else {
422 		/*
423 		 * Do re-ordering by putting one out of N packets at the front
424 		 * of the queue.
425 		 */
426 		cb->time_to_send = psched_get_time();
427 		q->counter = 0;
428 
429 		__skb_queue_head(&q->qdisc->q, skb);
430 		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
431 		q->qdisc->qstats.requeues++;
432 		ret = NET_XMIT_SUCCESS;
433 	}
434 
435 	if (ret != NET_XMIT_SUCCESS) {
436 		if (net_xmit_drop_count(ret)) {
437 			sch->qstats.drops++;
438 			return ret;
439 		}
440 	}
441 
442 	sch->q.qlen++;
443 	return NET_XMIT_SUCCESS;
444 }
445 
446 static unsigned int netem_drop(struct Qdisc *sch)
447 {
448 	struct netem_sched_data *q = qdisc_priv(sch);
449 	unsigned int len = 0;
450 
451 	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
452 		sch->q.qlen--;
453 		sch->qstats.drops++;
454 	}
455 	return len;
456 }
457 
458 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
459 {
460 	struct netem_sched_data *q = qdisc_priv(sch);
461 	struct sk_buff *skb;
462 
463 	if (qdisc_is_throttled(sch))
464 		return NULL;
465 
466 	skb = q->qdisc->ops->peek(q->qdisc);
467 	if (skb) {
468 		const struct netem_skb_cb *cb = netem_skb_cb(skb);
469 		psched_time_t now = psched_get_time();
470 
471 		/* if more time remaining? */
472 		if (cb->time_to_send <= now) {
473 			skb = qdisc_dequeue_peeked(q->qdisc);
474 			if (unlikely(!skb))
475 				return NULL;
476 
477 #ifdef CONFIG_NET_CLS_ACT
478 			/*
479 			 * If it's at ingress let's pretend the delay is
480 			 * from the network (tstamp will be updated).
481 			 */
482 			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
483 				skb->tstamp.tv64 = 0;
484 #endif
485 
486 			sch->q.qlen--;
487 			qdisc_unthrottled(sch);
488 			qdisc_bstats_update(sch, skb);
489 			return skb;
490 		}
491 
492 		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
493 	}
494 
495 	return NULL;
496 }
497 
498 static void netem_reset(struct Qdisc *sch)
499 {
500 	struct netem_sched_data *q = qdisc_priv(sch);
501 
502 	qdisc_reset(q->qdisc);
503 	sch->q.qlen = 0;
504 	qdisc_watchdog_cancel(&q->watchdog);
505 }
506 
507 static void dist_free(struct disttable *d)
508 {
509 	if (d) {
510 		if (is_vmalloc_addr(d))
511 			vfree(d);
512 		else
513 			kfree(d);
514 	}
515 }
516 
517 /*
518  * Distribution data is a variable size payload containing
519  * signed 16 bit values.
520  */
521 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
522 {
523 	struct netem_sched_data *q = qdisc_priv(sch);
524 	size_t n = nla_len(attr)/sizeof(__s16);
525 	const __s16 *data = nla_data(attr);
526 	spinlock_t *root_lock;
527 	struct disttable *d;
528 	int i;
529 	size_t s;
530 
531 	if (n > NETEM_DIST_MAX)
532 		return -EINVAL;
533 
534 	s = sizeof(struct disttable) + n * sizeof(s16);
535 	d = kmalloc(s, GFP_KERNEL);
536 	if (!d)
537 		d = vmalloc(s);
538 	if (!d)
539 		return -ENOMEM;
540 
541 	d->size = n;
542 	for (i = 0; i < n; i++)
543 		d->table[i] = data[i];
544 
545 	root_lock = qdisc_root_sleeping_lock(sch);
546 
547 	spin_lock_bh(root_lock);
548 	dist_free(q->delay_dist);
549 	q->delay_dist = d;
550 	spin_unlock_bh(root_lock);
551 	return 0;
552 }
553 
554 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
555 {
556 	struct netem_sched_data *q = qdisc_priv(sch);
557 	const struct tc_netem_corr *c = nla_data(attr);
558 
559 	init_crandom(&q->delay_cor, c->delay_corr);
560 	init_crandom(&q->loss_cor, c->loss_corr);
561 	init_crandom(&q->dup_cor, c->dup_corr);
562 }
563 
564 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
565 {
566 	struct netem_sched_data *q = qdisc_priv(sch);
567 	const struct tc_netem_reorder *r = nla_data(attr);
568 
569 	q->reorder = r->probability;
570 	init_crandom(&q->reorder_cor, r->correlation);
571 }
572 
573 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
574 {
575 	struct netem_sched_data *q = qdisc_priv(sch);
576 	const struct tc_netem_corrupt *r = nla_data(attr);
577 
578 	q->corrupt = r->probability;
579 	init_crandom(&q->corrupt_cor, r->correlation);
580 }
581 
582 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
583 {
584 	struct netem_sched_data *q = qdisc_priv(sch);
585 	const struct tc_netem_rate *r = nla_data(attr);
586 
587 	q->rate = r->rate;
588 	q->packet_overhead = r->packet_overhead;
589 	q->cell_size = r->cell_size;
590 	if (q->cell_size)
591 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
592 	q->cell_overhead = r->cell_overhead;
593 }
594 
595 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
596 {
597 	struct netem_sched_data *q = qdisc_priv(sch);
598 	const struct nlattr *la;
599 	int rem;
600 
601 	nla_for_each_nested(la, attr, rem) {
602 		u16 type = nla_type(la);
603 
604 		switch(type) {
605 		case NETEM_LOSS_GI: {
606 			const struct tc_netem_gimodel *gi = nla_data(la);
607 
608 			if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
609 				pr_info("netem: incorrect gi model size\n");
610 				return -EINVAL;
611 			}
612 
613 			q->loss_model = CLG_4_STATES;
614 
615 			q->clg.state = 1;
616 			q->clg.a1 = gi->p13;
617 			q->clg.a2 = gi->p31;
618 			q->clg.a3 = gi->p32;
619 			q->clg.a4 = gi->p14;
620 			q->clg.a5 = gi->p23;
621 			break;
622 		}
623 
624 		case NETEM_LOSS_GE: {
625 			const struct tc_netem_gemodel *ge = nla_data(la);
626 
627 			if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
628 				pr_info("netem: incorrect gi model size\n");
629 				return -EINVAL;
630 			}
631 
632 			q->loss_model = CLG_GILB_ELL;
633 			q->clg.state = 1;
634 			q->clg.a1 = ge->p;
635 			q->clg.a2 = ge->r;
636 			q->clg.a3 = ge->h;
637 			q->clg.a4 = ge->k1;
638 			break;
639 		}
640 
641 		default:
642 			pr_info("netem: unknown loss type %u\n", type);
643 			return -EINVAL;
644 		}
645 	}
646 
647 	return 0;
648 }
649 
650 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
651 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
652 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
653 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
654 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
655 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
656 };
657 
658 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
659 		      const struct nla_policy *policy, int len)
660 {
661 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
662 
663 	if (nested_len < 0) {
664 		pr_info("netem: invalid attributes len %d\n", nested_len);
665 		return -EINVAL;
666 	}
667 
668 	if (nested_len >= nla_attr_size(0))
669 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
670 				 nested_len, policy);
671 
672 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
673 	return 0;
674 }
675 
676 /* Parse netlink message to set options */
677 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
678 {
679 	struct netem_sched_data *q = qdisc_priv(sch);
680 	struct nlattr *tb[TCA_NETEM_MAX + 1];
681 	struct tc_netem_qopt *qopt;
682 	int ret;
683 
684 	if (opt == NULL)
685 		return -EINVAL;
686 
687 	qopt = nla_data(opt);
688 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
689 	if (ret < 0)
690 		return ret;
691 
692 	ret = fifo_set_limit(q->qdisc, qopt->limit);
693 	if (ret) {
694 		pr_info("netem: can't set fifo limit\n");
695 		return ret;
696 	}
697 
698 	q->latency = qopt->latency;
699 	q->jitter = qopt->jitter;
700 	q->limit = qopt->limit;
701 	q->gap = qopt->gap;
702 	q->counter = 0;
703 	q->loss = qopt->loss;
704 	q->duplicate = qopt->duplicate;
705 
706 	/* for compatibility with earlier versions.
707 	 * if gap is set, need to assume 100% probability
708 	 */
709 	if (q->gap)
710 		q->reorder = ~0;
711 
712 	if (tb[TCA_NETEM_CORR])
713 		get_correlation(sch, tb[TCA_NETEM_CORR]);
714 
715 	if (tb[TCA_NETEM_DELAY_DIST]) {
716 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
717 		if (ret)
718 			return ret;
719 	}
720 
721 	if (tb[TCA_NETEM_REORDER])
722 		get_reorder(sch, tb[TCA_NETEM_REORDER]);
723 
724 	if (tb[TCA_NETEM_CORRUPT])
725 		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
726 
727 	if (tb[TCA_NETEM_RATE])
728 		get_rate(sch, tb[TCA_NETEM_RATE]);
729 
730 	q->loss_model = CLG_RANDOM;
731 	if (tb[TCA_NETEM_LOSS])
732 		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
733 
734 	return ret;
735 }
736 
737 /*
738  * Special case version of FIFO queue for use by netem.
739  * It queues in order based on timestamps in skb's
740  */
741 struct fifo_sched_data {
742 	u32 limit;
743 	psched_time_t oldest;
744 };
745 
746 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
747 {
748 	struct fifo_sched_data *q = qdisc_priv(sch);
749 	struct sk_buff_head *list = &sch->q;
750 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
751 	struct sk_buff *skb;
752 
753 	if (likely(skb_queue_len(list) < q->limit)) {
754 		/* Optimize for add at tail */
755 		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
756 			q->oldest = tnext;
757 			return qdisc_enqueue_tail(nskb, sch);
758 		}
759 
760 		skb_queue_reverse_walk(list, skb) {
761 			const struct netem_skb_cb *cb = netem_skb_cb(skb);
762 
763 			if (tnext >= cb->time_to_send)
764 				break;
765 		}
766 
767 		__skb_queue_after(list, skb, nskb);
768 
769 		sch->qstats.backlog += qdisc_pkt_len(nskb);
770 
771 		return NET_XMIT_SUCCESS;
772 	}
773 
774 	return qdisc_reshape_fail(nskb, sch);
775 }
776 
777 static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
778 {
779 	struct fifo_sched_data *q = qdisc_priv(sch);
780 
781 	if (opt) {
782 		struct tc_fifo_qopt *ctl = nla_data(opt);
783 		if (nla_len(opt) < sizeof(*ctl))
784 			return -EINVAL;
785 
786 		q->limit = ctl->limit;
787 	} else
788 		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
789 
790 	q->oldest = PSCHED_PASTPERFECT;
791 	return 0;
792 }
793 
794 static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
795 {
796 	struct fifo_sched_data *q = qdisc_priv(sch);
797 	struct tc_fifo_qopt opt = { .limit = q->limit };
798 
799 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
800 	return skb->len;
801 
802 nla_put_failure:
803 	return -1;
804 }
805 
806 static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
807 	.id		=	"tfifo",
808 	.priv_size	=	sizeof(struct fifo_sched_data),
809 	.enqueue	=	tfifo_enqueue,
810 	.dequeue	=	qdisc_dequeue_head,
811 	.peek		=	qdisc_peek_head,
812 	.drop		=	qdisc_queue_drop,
813 	.init		=	tfifo_init,
814 	.reset		=	qdisc_reset_queue,
815 	.change		=	tfifo_init,
816 	.dump		=	tfifo_dump,
817 };
818 
819 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
820 {
821 	struct netem_sched_data *q = qdisc_priv(sch);
822 	int ret;
823 
824 	if (!opt)
825 		return -EINVAL;
826 
827 	qdisc_watchdog_init(&q->watchdog, sch);
828 
829 	q->loss_model = CLG_RANDOM;
830 	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
831 				     TC_H_MAKE(sch->handle, 1));
832 	if (!q->qdisc) {
833 		pr_notice("netem: qdisc create tfifo qdisc failed\n");
834 		return -ENOMEM;
835 	}
836 
837 	ret = netem_change(sch, opt);
838 	if (ret) {
839 		pr_info("netem: change failed\n");
840 		qdisc_destroy(q->qdisc);
841 	}
842 	return ret;
843 }
844 
845 static void netem_destroy(struct Qdisc *sch)
846 {
847 	struct netem_sched_data *q = qdisc_priv(sch);
848 
849 	qdisc_watchdog_cancel(&q->watchdog);
850 	qdisc_destroy(q->qdisc);
851 	dist_free(q->delay_dist);
852 }
853 
854 static int dump_loss_model(const struct netem_sched_data *q,
855 			   struct sk_buff *skb)
856 {
857 	struct nlattr *nest;
858 
859 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
860 	if (nest == NULL)
861 		goto nla_put_failure;
862 
863 	switch (q->loss_model) {
864 	case CLG_RANDOM:
865 		/* legacy loss model */
866 		nla_nest_cancel(skb, nest);
867 		return 0;	/* no data */
868 
869 	case CLG_4_STATES: {
870 		struct tc_netem_gimodel gi = {
871 			.p13 = q->clg.a1,
872 			.p31 = q->clg.a2,
873 			.p32 = q->clg.a3,
874 			.p14 = q->clg.a4,
875 			.p23 = q->clg.a5,
876 		};
877 
878 		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
879 		break;
880 	}
881 	case CLG_GILB_ELL: {
882 		struct tc_netem_gemodel ge = {
883 			.p = q->clg.a1,
884 			.r = q->clg.a2,
885 			.h = q->clg.a3,
886 			.k1 = q->clg.a4,
887 		};
888 
889 		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
890 		break;
891 	}
892 	}
893 
894 	nla_nest_end(skb, nest);
895 	return 0;
896 
897 nla_put_failure:
898 	nla_nest_cancel(skb, nest);
899 	return -1;
900 }
901 
902 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
903 {
904 	const struct netem_sched_data *q = qdisc_priv(sch);
905 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
906 	struct tc_netem_qopt qopt;
907 	struct tc_netem_corr cor;
908 	struct tc_netem_reorder reorder;
909 	struct tc_netem_corrupt corrupt;
910 	struct tc_netem_rate rate;
911 
912 	qopt.latency = q->latency;
913 	qopt.jitter = q->jitter;
914 	qopt.limit = q->limit;
915 	qopt.loss = q->loss;
916 	qopt.gap = q->gap;
917 	qopt.duplicate = q->duplicate;
918 	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
919 
920 	cor.delay_corr = q->delay_cor.rho;
921 	cor.loss_corr = q->loss_cor.rho;
922 	cor.dup_corr = q->dup_cor.rho;
923 	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
924 
925 	reorder.probability = q->reorder;
926 	reorder.correlation = q->reorder_cor.rho;
927 	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
928 
929 	corrupt.probability = q->corrupt;
930 	corrupt.correlation = q->corrupt_cor.rho;
931 	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
932 
933 	rate.rate = q->rate;
934 	rate.packet_overhead = q->packet_overhead;
935 	rate.cell_size = q->cell_size;
936 	rate.cell_overhead = q->cell_overhead;
937 	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
938 
939 	if (dump_loss_model(q, skb) != 0)
940 		goto nla_put_failure;
941 
942 	return nla_nest_end(skb, nla);
943 
944 nla_put_failure:
945 	nlmsg_trim(skb, nla);
946 	return -1;
947 }
948 
949 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
950 			  struct sk_buff *skb, struct tcmsg *tcm)
951 {
952 	struct netem_sched_data *q = qdisc_priv(sch);
953 
954 	if (cl != 1) 	/* only one class */
955 		return -ENOENT;
956 
957 	tcm->tcm_handle |= TC_H_MIN(1);
958 	tcm->tcm_info = q->qdisc->handle;
959 
960 	return 0;
961 }
962 
963 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
964 		     struct Qdisc **old)
965 {
966 	struct netem_sched_data *q = qdisc_priv(sch);
967 
968 	if (new == NULL)
969 		new = &noop_qdisc;
970 
971 	sch_tree_lock(sch);
972 	*old = q->qdisc;
973 	q->qdisc = new;
974 	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
975 	qdisc_reset(*old);
976 	sch_tree_unlock(sch);
977 
978 	return 0;
979 }
980 
981 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
982 {
983 	struct netem_sched_data *q = qdisc_priv(sch);
984 	return q->qdisc;
985 }
986 
987 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
988 {
989 	return 1;
990 }
991 
992 static void netem_put(struct Qdisc *sch, unsigned long arg)
993 {
994 }
995 
996 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
997 {
998 	if (!walker->stop) {
999 		if (walker->count >= walker->skip)
1000 			if (walker->fn(sch, 1, walker) < 0) {
1001 				walker->stop = 1;
1002 				return;
1003 			}
1004 		walker->count++;
1005 	}
1006 }
1007 
1008 static const struct Qdisc_class_ops netem_class_ops = {
1009 	.graft		=	netem_graft,
1010 	.leaf		=	netem_leaf,
1011 	.get		=	netem_get,
1012 	.put		=	netem_put,
1013 	.walk		=	netem_walk,
1014 	.dump		=	netem_dump_class,
1015 };
1016 
1017 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1018 	.id		=	"netem",
1019 	.cl_ops		=	&netem_class_ops,
1020 	.priv_size	=	sizeof(struct netem_sched_data),
1021 	.enqueue	=	netem_enqueue,
1022 	.dequeue	=	netem_dequeue,
1023 	.peek		=	qdisc_peek_dequeued,
1024 	.drop		=	netem_drop,
1025 	.init		=	netem_init,
1026 	.reset		=	netem_reset,
1027 	.destroy	=	netem_destroy,
1028 	.change		=	netem_change,
1029 	.dump		=	netem_dump,
1030 	.owner		=	THIS_MODULE,
1031 };
1032 
1033 
1034 static int __init netem_module_init(void)
1035 {
1036 	pr_info("netem: version " VERSION "\n");
1037 	return register_qdisc(&netem_qdisc_ops);
1038 }
1039 static void __exit netem_module_exit(void)
1040 {
1041 	unregister_qdisc(&netem_qdisc_ops);
1042 }
1043 module_init(netem_module_init)
1044 module_exit(netem_module_exit)
1045 MODULE_LICENSE("GPL");
1046