xref: /linux/net/sched/sch_netem.c (revision 26b0d14106954ae46d2f4f7eec3481828a210f7d)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 
27 #include <net/netlink.h>
28 #include <net/pkt_sched.h>
29 #include <net/inet_ecn.h>
30 
31 #define VERSION "1.3"
32 
33 /*	Network Emulation Queuing algorithm.
34 	====================================
35 
36 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
37 		 Network Emulation Tool"
38 		 [2] Luigi Rizzo, DummyNet for FreeBSD
39 
40 	 ----------------------------------------------------------------
41 
42 	 This started out as a simple way to delay outgoing packets to
43 	 test TCP but has grown to include most of the functionality
44 	 of a full-blown network emulator like NISTnet. It can delay
45 	 packets and add random jitter (and correlation). The random
46 	 distribution can be loaded from a table as well to provide
47 	 normal, Pareto, or experimental curves. Packet loss,
48 	 duplication, and reordering can also be emulated.
49 
50 	 This qdisc does not do classification; that can be handled by
51 	 layering other disciplines.  It does not need to do bandwidth
52 	 control either, since that can be handled by using a token
53 	 bucket or other rate control.
54 
55      Correlated Loss Generator models
56 
57 	Added generation of correlated loss, using either a 4-state
58 	Markov chain (the GI model) or the "Gilbert-Elliot" model.
59 
60 	References:
61 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
62 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
63 	and intuitive loss model for packet networks and its implementation
64 	in the Netem module in the Linux kernel", available in [1]
65 
66 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
67 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
68 */
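
/* Illustrative userspace examples (see tc-netem(8) for the authoritative
 * syntax).  The features above are driven with iproute2's tc; eth0 below
 * is just an example device:
 *
 *	# 100ms delay with 10ms jitter and 25% correlation
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *	# random loss, duplication, corruption and reordering
 *	tc qdisc change dev eth0 root netem loss 0.3% 25%
 *	tc qdisc change dev eth0 root netem duplicate 1% corrupt 0.1%
 *	tc qdisc change dev eth0 root netem delay 10ms reorder 25% 50%
 *
 *	# correlated loss using the Gilbert-Elliot model
 *	tc qdisc change dev eth0 root netem loss gemodel 1% 10% 70% 0.1%
 */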
69 
70 struct netem_sched_data {
71 	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
72 
73 	/* optional qdisc for classful handling (NULL at netem init) */
74 	struct Qdisc	*qdisc;
75 
76 	struct qdisc_watchdog watchdog;
77 
78 	psched_tdiff_t latency;
79 	psched_tdiff_t jitter;
80 
81 	u32 loss;
82 	u32 ecn;
83 	u32 limit;
84 	u32 counter;
85 	u32 gap;
86 	u32 duplicate;
87 	u32 reorder;
88 	u32 corrupt;
89 	u32 rate;
90 	s32 packet_overhead;
91 	u32 cell_size;
92 	u32 cell_size_reciprocal;
93 	s32 cell_overhead;
94 
95 	struct crndstate {
96 		u32 last;
97 		u32 rho;
98 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
99 
100 	struct disttable {
101 		u32  size;
102 		s16 table[0];
103 	} *delay_dist;
104 
105 	enum  {
106 		CLG_RANDOM,
107 		CLG_4_STATES,
108 		CLG_GILB_ELL,
109 	} loss_model;
110 
111 	/* Correlated Loss Generation models */
112 	struct clgstate {
113 		/* state of the Markov chain */
114 		u8 state;
115 
116 		/* 4-states and Gilbert-Elliot models */
117 		u32 a1;	/* p13 for 4-states or p for GE */
118 		u32 a2;	/* p31 for 4-states or r for GE */
119 		u32 a3;	/* p32 for 4-states or h for GE */
120 		u32 a4;	/* p14 for 4-states or 1-k for GE */
121 		u32 a5; /* p23 used only in 4-states */
122 	} clg;
123 
124 };
125 
126 /* Time stamp put into socket buffer control block
127  * Only valid when skbs are in our internal t(ime)fifo queue.
128  */
129 struct netem_skb_cb {
130 	psched_time_t	time_to_send;
131 };
132 
133 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
134 {
135 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
136 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
137 }
138 
139 /* init_crandom - initialize correlated random number generator
140  * Use entropy source for initial seed.
141  */
142 static void init_crandom(struct crndstate *state, unsigned long rho)
143 {
144 	state->rho = rho;
145 	state->last = net_random();
146 }
147 
148 /* get_crandom - correlated random number generator
149  * Next number depends on last value.
150  * rho is scaled to avoid floating point.
151  */
152 static u32 get_crandom(struct crndstate *state)
153 {
154 	u64 value, rho;
155 	unsigned long answer;
156 
157 	if (state->rho == 0)	/* no correlation */
158 		return net_random();
159 
160 	value = net_random();
161 	rho = (u64)state->rho + 1;
162 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
163 	state->last = answer;
164 	return answer;
165 }
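
/* In fixed-point terms, get_crandom() above computes
 *
 *	answer = ((2^32 - rho) * value + rho * last) / 2^32
 *
 * i.e. a weighted blend of a fresh uniform sample and the previous
 * output, with roughly state->rho / 2^32 acting as the correlation
 * coefficient: rho == 0 gives uncorrelated noise, rho == ~0 keeps
 * repeating the last value.
 */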
166 
167 /* loss_4state - 4-state model loss generator
168  * Generates losses according to the 4-state Markov chain adopted in
169  * the GI (General and Intuitive) loss model.
170  */
171 static bool loss_4state(struct netem_sched_data *q)
172 {
173 	struct clgstate *clg = &q->clg;
174 	u32 rnd = net_random();
175 
176 	/*
177 	 * Makes a comparison between rnd and the transition
178 	 * probabilities outgoing from the current state, then decides the
179 	 * next state and if the next packet has to be transmitted or lost.
180 	 * The four states correspond to:
181 	 *   1 => successfully transmitted packets within a gap period
182 	 *   4 => isolated losses within a gap period
183 	 *   3 => lost packets within a burst period
184 	 *   2 => successfully transmitted packets within a burst period
185 	 */
186 	switch (clg->state) {
187 	case 1:
188 		if (rnd < clg->a4) {
189 			clg->state = 4;
190 			return true;
191 		} else if (clg->a4 < rnd && rnd < clg->a1) {
192 			clg->state = 3;
193 			return true;
194 		} else if (clg->a1 < rnd)
195 			clg->state = 1;
196 
197 		break;
198 	case 2:
199 		if (rnd < clg->a5) {
200 			clg->state = 3;
201 			return true;
202 		} else
203 			clg->state = 2;
204 
205 		break;
206 	case 3:
207 		if (rnd < clg->a3)
208 			clg->state = 2;
209 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
210 			clg->state = 1;
211 			return true;
212 		} else if (clg->a2 + clg->a3 < rnd) {
213 			clg->state = 3;
214 			return true;
215 		}
216 		break;
217 	case 4:
218 		clg->state = 1;
219 		break;
220 	}
221 
222 	return false;
223 }
224 
225 /* loss_gilb_ell - Gilbert-Elliot model loss generator
226  * Generates losses according to the Gilbert-Elliot loss model or
227  * its special cases  (Gilbert or Simple Gilbert)
228  *
229  * Makes a comparison between random number and the transition
230  * probabilities outgoing from the current state, then decides the
231  * next state. A second random number is extracted and the comparison
232  * with the loss probability of the current state decides if the next
233  * packet will be transmitted or lost.
234  */
235 static bool loss_gilb_ell(struct netem_sched_data *q)
236 {
237 	struct clgstate *clg = &q->clg;
238 
239 	switch (clg->state) {
240 	case 1:
241 		if (net_random() < clg->a1)
242 			clg->state = 2;
243 		if (net_random() < clg->a4)
244 			return true;
		break;
245 	case 2:
246 		if (net_random() < clg->a2)
247 			clg->state = 1;
248 		if (net_random() < clg->a3)
249 			return true;
250 	}
251 
252 	return false;
253 }
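
/* Parameter mapping for loss_gilb_ell(), as filled in by get_loss_clg():
 *	a1 = p:   probability of moving from the good state (1) to the
 *		  bad state (2)
 *	a2 = r:   probability of moving back from the bad state to the
 *		  good state
 *	a3 = h:   threshold a fresh random draw is compared against to
 *		  declare a loss while in the bad state
 *	a4 = 1-k: the same, while in the good state
 */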
254 
255 static bool loss_event(struct netem_sched_data *q)
256 {
257 	switch (q->loss_model) {
258 	case CLG_RANDOM:
259 		/* Random packet drop 0 => none, ~0 => all */
260 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
261 
262 	case CLG_4_STATES:
263 		/* 4-state loss model algorithm (used also for GI model).
264 		 * Ask the 4-state Markov loss generator whether the
265 		 * current packet should be dropped; a true return value
266 		 * means the packet is lost.
267 		 */
268 		return loss_4state(q);
269 
270 	case CLG_GILB_ELL:
271 		/* Gilbert-Elliot loss model algorithm.
272 		 * Ask the Gilbert-Elliot loss generator whether the
273 		 * current packet should be dropped; a true return value
274 		 * means the packet is lost.
275 		 */
276 		return loss_gilb_ell(q);
277 	}
278 
279 	return false;	/* not reached */
280 }
281 
282 
283 /* tabledist - return a pseudo-randomly distributed value with mean mu and
284  * std deviation sigma.  Uses table lookup to approximate the desired
285  * distribution, and a uniformly-distributed pseudo-random source.
286  */
287 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
288 				struct crndstate *state,
289 				const struct disttable *dist)
290 {
291 	psched_tdiff_t x;
292 	long t;
293 	u32 rnd;
294 
295 	if (sigma == 0)
296 		return mu;
297 
298 	rnd = get_crandom(state);
299 
300 	/* default uniform distribution */
301 	if (dist == NULL)
302 		return (rnd % (2*sigma)) - sigma + mu;
303 
304 	t = dist->table[rnd % dist->size];
305 	x = (sigma % NETEM_DIST_SCALE) * t;
306 	if (x >= 0)
307 		x += NETEM_DIST_SCALE/2;
308 	else
309 		x -= NETEM_DIST_SCALE/2;
310 
311 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
312 }
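
/* Note on the arithmetic in tabledist(): with t drawn from the
 * distribution table, the value returned above is approximately
 *
 *	mu + t * sigma / NETEM_DIST_SCALE	(rounded to the nearest tick)
 *
 * The multiplication is split around NETEM_DIST_SCALE so that only
 * sigma % NETEM_DIST_SCALE is multiplied by t before the division,
 * keeping the intermediate product small.
 */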
313 
314 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
315 {
316 	u64 ticks;
317 
318 	len += q->packet_overhead;
319 
320 	if (q->cell_size) {
321 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
322 
323 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
324 			cells++;
325 		len = cells * (q->cell_size + q->cell_overhead);
326 	}
327 
328 	ticks = (u64)len * NSEC_PER_SEC;
329 
330 	do_div(ticks, q->rate);
331 	return PSCHED_NS2TICKS(ticks);
332 }
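
/* Worked example (illustrative numbers): with q->rate = 125000 bytes/s
 * (1 Mbit/s), no packet/cell overhead and no cell size, a 1500 byte
 * packet gives
 *
 *	ticks = 1500 * NSEC_PER_SEC / 125000 = 12,000,000 ns
 *
 * i.e. 12 ms of serialisation delay, converted to scheduler ticks by
 * PSCHED_NS2TICKS().  When cell_size is set, the length is first rounded
 * up to whole cells (plus cell_overhead), e.g. for ATM-style framing.
 */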
333 
334 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
335 {
336 	struct sk_buff_head *list = &sch->q;
337 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
338 	struct sk_buff *skb;
339 
340 	if (likely(skb_queue_len(list) < sch->limit)) {
341 		skb = skb_peek_tail(list);
342 		/* Optimize for add at tail */
343 		if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
344 			return qdisc_enqueue_tail(nskb, sch);
345 
346 		skb_queue_reverse_walk(list, skb) {
347 			if (tnext >= netem_skb_cb(skb)->time_to_send)
348 				break;
349 		}
350 
351 		__skb_queue_after(list, skb, nskb);
352 		sch->qstats.backlog += qdisc_pkt_len(nskb);
353 		return NET_XMIT_SUCCESS;
354 	}
355 
356 	return qdisc_reshape_fail(nskb, sch);
357 }
358 
359 /*
360  * Insert one skb into qdisc.
361  * Note: parent depends on return value to account for queue length.
362  * 	NET_XMIT_DROP: queue length didn't change.
363  *      NET_XMIT_SUCCESS: one skb was queued.
364  */
365 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
366 {
367 	struct netem_sched_data *q = qdisc_priv(sch);
368 	/* We don't fill cb now as skb_unshare() may invalidate it */
369 	struct netem_skb_cb *cb;
370 	struct sk_buff *skb2;
371 	int ret;
372 	int count = 1;
373 
374 	/* Random duplication */
375 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
376 		++count;
377 
378 	/* Drop packet? */
379 	if (loss_event(q)) {
380 		if (q->ecn && INET_ECN_set_ce(skb))
381 			sch->qstats.drops++; /* mark packet */
382 		else
383 			--count;
384 	}
385 	if (count == 0) {
386 		sch->qstats.drops++;
387 		kfree_skb(skb);
388 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
389 	}
390 
391 	skb_orphan(skb);
392 
393 	/*
394 	 * If we need to duplicate packet, then re-insert at top of the
395 	 * qdisc tree, since parent queuer expects that only one
396 	 * skb will be queued.
397 	 */
398 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
399 		struct Qdisc *rootq = qdisc_root(sch);
400 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
401 		q->duplicate = 0;
402 
403 		qdisc_enqueue_root(skb2, rootq);
404 		q->duplicate = dupsave;
405 	}
406 
407 	/*
408 	 * Randomized packet corruption.
409 	 * Make a copy if needed since we are modifying the data.
410 	 * If the packet is going to be hardware checksummed, then
411 	 * do the checksum now in software before we mangle it.
412 	 */
413 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
414 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
415 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
416 		     skb_checksum_help(skb)))
417 			return qdisc_drop(skb, sch);
418 
419 		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
420 	}
421 
422 	cb = netem_skb_cb(skb);
423 	if (q->gap == 0 ||		/* not doing reordering */
424 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
425 	    q->reorder < get_crandom(&q->reorder_cor)) {
426 		psched_time_t now;
427 		psched_tdiff_t delay;
428 
429 		delay = tabledist(q->latency, q->jitter,
430 				  &q->delay_cor, q->delay_dist);
431 
432 		now = psched_get_time();
433 
434 		if (q->rate) {
435 			struct sk_buff_head *list = &sch->q;
436 
437 			delay += packet_len_2_sched_time(skb->len, q);
438 
439 			if (!skb_queue_empty(list)) {
440 				/*
441 				 * Last packet in queue is reference point (now).
442 				 * The first packet in the queue is already in flight;
443 				 * calculate this time bonus and subtract it
444 				 * from the delay.
445 				 */
446 				delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
447 				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
448 			}
449 		}
450 
451 		cb->time_to_send = now + delay;
452 		++q->counter;
453 		ret = tfifo_enqueue(skb, sch);
454 	} else {
455 		/*
456 		 * Do re-ordering by putting one out of N packets at the front
457 		 * of the queue.
458 		 */
459 		cb->time_to_send = psched_get_time();
460 		q->counter = 0;
461 
462 		__skb_queue_head(&sch->q, skb);
463 		sch->qstats.backlog += qdisc_pkt_len(skb);
464 		sch->qstats.requeues++;
465 		ret = NET_XMIT_SUCCESS;
466 	}
467 
468 	if (ret != NET_XMIT_SUCCESS) {
469 		if (net_xmit_drop_count(ret)) {
470 			sch->qstats.drops++;
471 			return ret;
472 		}
473 	}
474 
475 	return NET_XMIT_SUCCESS;
476 }
477 
478 static unsigned int netem_drop(struct Qdisc *sch)
479 {
480 	struct netem_sched_data *q = qdisc_priv(sch);
481 	unsigned int len;
482 
483 	len = qdisc_queue_drop(sch);
484 	if (!len && q->qdisc && q->qdisc->ops->drop)
485 		len = q->qdisc->ops->drop(q->qdisc);
486 	if (len)
487 		sch->qstats.drops++;
488 
489 	return len;
490 }
491 
492 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
493 {
494 	struct netem_sched_data *q = qdisc_priv(sch);
495 	struct sk_buff *skb;
496 
497 	if (qdisc_is_throttled(sch))
498 		return NULL;
499 
500 tfifo_dequeue:
501 	skb = qdisc_peek_head(sch);
502 	if (skb) {
503 		const struct netem_skb_cb *cb = netem_skb_cb(skb);
504 
505 		/* is it time to send this packet? */
506 		if (cb->time_to_send <= psched_get_time()) {
507 			__skb_unlink(skb, &sch->q);
508 			sch->qstats.backlog -= qdisc_pkt_len(skb);
509 
510 #ifdef CONFIG_NET_CLS_ACT
511 			/*
512 			 * If it's at ingress let's pretend the delay is
513 			 * from the network (tstamp will be updated).
514 			 */
515 			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
516 				skb->tstamp.tv64 = 0;
517 #endif
518 
519 			if (q->qdisc) {
520 				int err = qdisc_enqueue(skb, q->qdisc);
521 
522 				if (unlikely(err != NET_XMIT_SUCCESS)) {
523 					if (net_xmit_drop_count(err)) {
524 						sch->qstats.drops++;
525 						qdisc_tree_decrease_qlen(sch, 1);
526 					}
527 				}
528 				goto tfifo_dequeue;
529 			}
530 deliver:
531 			qdisc_unthrottled(sch);
532 			qdisc_bstats_update(sch, skb);
533 			return skb;
534 		}
535 
536 		if (q->qdisc) {
537 			skb = q->qdisc->ops->dequeue(q->qdisc);
538 			if (skb)
539 				goto deliver;
540 		}
541 		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
542 	}
543 
544 	if (q->qdisc) {
545 		skb = q->qdisc->ops->dequeue(q->qdisc);
546 		if (skb)
547 			goto deliver;
548 	}
549 	return NULL;
550 }
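
/* Dequeue summary: packets wait in the time-sorted tfifo (sch->q) until
 * their time_to_send.  Without a child qdisc, an expired packet is handed
 * straight to the caller; with one (classful use), it is first enqueued
 * into q->qdisc and delivered from there, so the child can apply its own
 * discipline (e.g. rate limiting) after the emulated delay.
 */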
551 
552 static void netem_reset(struct Qdisc *sch)
553 {
554 	struct netem_sched_data *q = qdisc_priv(sch);
555 
556 	qdisc_reset_queue(sch);
557 	if (q->qdisc)
558 		qdisc_reset(q->qdisc);
559 	qdisc_watchdog_cancel(&q->watchdog);
560 }
561 
562 static void dist_free(struct disttable *d)
563 {
564 	if (d) {
565 		if (is_vmalloc_addr(d))
566 			vfree(d);
567 		else
568 			kfree(d);
569 	}
570 }
571 
572 /*
573  * Distribution data is a variable size payload containing
574  * signed 16 bit values.
575  */
576 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
577 {
578 	struct netem_sched_data *q = qdisc_priv(sch);
579 	size_t n = nla_len(attr)/sizeof(__s16);
580 	const __s16 *data = nla_data(attr);
581 	spinlock_t *root_lock;
582 	struct disttable *d;
583 	int i;
584 	size_t s;
585 
586 	if (n > NETEM_DIST_MAX)
587 		return -EINVAL;
588 
589 	s = sizeof(struct disttable) + n * sizeof(s16);
590 	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
591 	if (!d)
592 		d = vmalloc(s);
593 	if (!d)
594 		return -ENOMEM;
595 
596 	d->size = n;
597 	for (i = 0; i < n; i++)
598 		d->table[i] = data[i];
599 
600 	root_lock = qdisc_root_sleeping_lock(sch);
601 
602 	spin_lock_bh(root_lock);
603 	swap(q->delay_dist, d);
604 	spin_unlock_bh(root_lock);
605 
606 	dist_free(d);
607 	return 0;
608 }
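
/* Note: swap() under the qdisc root lock publishes the new table; d then
 * refers to the previously installed table (possibly NULL), which
 * dist_free() releases after the lock is dropped.
 */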
609 
610 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
611 {
612 	struct netem_sched_data *q = qdisc_priv(sch);
613 	const struct tc_netem_corr *c = nla_data(attr);
614 
615 	init_crandom(&q->delay_cor, c->delay_corr);
616 	init_crandom(&q->loss_cor, c->loss_corr);
617 	init_crandom(&q->dup_cor, c->dup_corr);
618 }
619 
620 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
621 {
622 	struct netem_sched_data *q = qdisc_priv(sch);
623 	const struct tc_netem_reorder *r = nla_data(attr);
624 
625 	q->reorder = r->probability;
626 	init_crandom(&q->reorder_cor, r->correlation);
627 }
628 
629 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
630 {
631 	struct netem_sched_data *q = qdisc_priv(sch);
632 	const struct tc_netem_corrupt *r = nla_data(attr);
633 
634 	q->corrupt = r->probability;
635 	init_crandom(&q->corrupt_cor, r->correlation);
636 }
637 
638 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
639 {
640 	struct netem_sched_data *q = qdisc_priv(sch);
641 	const struct tc_netem_rate *r = nla_data(attr);
642 
643 	q->rate = r->rate;
644 	q->packet_overhead = r->packet_overhead;
645 	q->cell_size = r->cell_size;
646 	if (q->cell_size)
647 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
648 	q->cell_overhead = r->cell_overhead;
649 }
650 
651 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
652 {
653 	struct netem_sched_data *q = qdisc_priv(sch);
654 	const struct nlattr *la;
655 	int rem;
656 
657 	nla_for_each_nested(la, attr, rem) {
658 		u16 type = nla_type(la);
659 
660 		switch (type) {
661 		case NETEM_LOSS_GI: {
662 			const struct tc_netem_gimodel *gi = nla_data(la);
663 
664 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
665 				pr_info("netem: incorrect gi model size\n");
666 				return -EINVAL;
667 			}
668 
669 			q->loss_model = CLG_4_STATES;
670 
671 			q->clg.state = 1;
672 			q->clg.a1 = gi->p13;
673 			q->clg.a2 = gi->p31;
674 			q->clg.a3 = gi->p32;
675 			q->clg.a4 = gi->p14;
676 			q->clg.a5 = gi->p23;
677 			break;
678 		}
679 
680 		case NETEM_LOSS_GE: {
681 			const struct tc_netem_gemodel *ge = nla_data(la);
682 
683 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
684 				pr_info("netem: incorrect ge model size\n");
685 				return -EINVAL;
686 			}
687 
688 			q->loss_model = CLG_GILB_ELL;
689 			q->clg.state = 1;
690 			q->clg.a1 = ge->p;
691 			q->clg.a2 = ge->r;
692 			q->clg.a3 = ge->h;
693 			q->clg.a4 = ge->k1;
694 			break;
695 		}
696 
697 		default:
698 			pr_info("netem: unknown loss type %u\n", type);
699 			return -EINVAL;
700 		}
701 	}
702 
703 	return 0;
704 }
705 
706 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
707 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
708 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
709 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
710 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
711 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
712 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
713 };
714 
715 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
716 		      const struct nla_policy *policy, int len)
717 {
718 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
719 
720 	if (nested_len < 0) {
721 		pr_info("netem: invalid attributes len %d\n", nested_len);
722 		return -EINVAL;
723 	}
724 
725 	if (nested_len >= nla_attr_size(0))
726 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
727 				 nested_len, policy);
728 
729 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
730 	return 0;
731 }
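
/* netem's TCA_OPTIONS payload is, for compatibility with old userspace, a
 * fixed struct tc_netem_qopt followed by optional netlink attributes.
 * parse_attr() above therefore skips NLA_ALIGN(len) bytes of the payload
 * before handing the remainder to nla_parse(), instead of using the
 * standard nested-attribute helpers.
 */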
732 
733 /* Parse netlink message to set options */
734 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
735 {
736 	struct netem_sched_data *q = qdisc_priv(sch);
737 	struct nlattr *tb[TCA_NETEM_MAX + 1];
738 	struct tc_netem_qopt *qopt;
739 	int ret;
740 
741 	if (opt == NULL)
742 		return -EINVAL;
743 
744 	qopt = nla_data(opt);
745 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
746 	if (ret < 0)
747 		return ret;
748 
749 	sch->limit = qopt->limit;
750 
751 	q->latency = qopt->latency;
752 	q->jitter = qopt->jitter;
753 	q->limit = qopt->limit;
754 	q->gap = qopt->gap;
755 	q->counter = 0;
756 	q->loss = qopt->loss;
757 	q->duplicate = qopt->duplicate;
758 
759 	/* for compatibility with earlier versions.
760 	 * if gap is set, need to assume 100% reorder probability
761 	 */
762 	if (q->gap)
763 		q->reorder = ~0;
764 
765 	if (tb[TCA_NETEM_CORR])
766 		get_correlation(sch, tb[TCA_NETEM_CORR]);
767 
768 	if (tb[TCA_NETEM_DELAY_DIST]) {
769 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
770 		if (ret)
771 			return ret;
772 	}
773 
774 	if (tb[TCA_NETEM_REORDER])
775 		get_reorder(sch, tb[TCA_NETEM_REORDER]);
776 
777 	if (tb[TCA_NETEM_CORRUPT])
778 		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
779 
780 	if (tb[TCA_NETEM_RATE])
781 		get_rate(sch, tb[TCA_NETEM_RATE]);
782 
783 	if (tb[TCA_NETEM_ECN])
784 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
785 
786 	q->loss_model = CLG_RANDOM;
787 	if (tb[TCA_NETEM_LOSS])
788 		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
789 
790 	return ret;
791 }
792 
793 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
794 {
795 	struct netem_sched_data *q = qdisc_priv(sch);
796 	int ret;
797 
798 	if (!opt)
799 		return -EINVAL;
800 
801 	qdisc_watchdog_init(&q->watchdog, sch);
802 
803 	q->loss_model = CLG_RANDOM;
804 	ret = netem_change(sch, opt);
805 	if (ret)
806 		pr_info("netem: change failed\n");
807 	return ret;
808 }
809 
810 static void netem_destroy(struct Qdisc *sch)
811 {
812 	struct netem_sched_data *q = qdisc_priv(sch);
813 
814 	qdisc_watchdog_cancel(&q->watchdog);
815 	if (q->qdisc)
816 		qdisc_destroy(q->qdisc);
817 	dist_free(q->delay_dist);
818 }
819 
820 static int dump_loss_model(const struct netem_sched_data *q,
821 			   struct sk_buff *skb)
822 {
823 	struct nlattr *nest;
824 
825 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
826 	if (nest == NULL)
827 		goto nla_put_failure;
828 
829 	switch (q->loss_model) {
830 	case CLG_RANDOM:
831 		/* legacy loss model */
832 		nla_nest_cancel(skb, nest);
833 		return 0;	/* no data */
834 
835 	case CLG_4_STATES: {
836 		struct tc_netem_gimodel gi = {
837 			.p13 = q->clg.a1,
838 			.p31 = q->clg.a2,
839 			.p32 = q->clg.a3,
840 			.p14 = q->clg.a4,
841 			.p23 = q->clg.a5,
842 		};
843 
844 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
845 			goto nla_put_failure;
846 		break;
847 	}
848 	case CLG_GILB_ELL: {
849 		struct tc_netem_gemodel ge = {
850 			.p = q->clg.a1,
851 			.r = q->clg.a2,
852 			.h = q->clg.a3,
853 			.k1 = q->clg.a4,
854 		};
855 
856 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
857 			goto nla_put_failure;
858 		break;
859 	}
860 	}
861 
862 	nla_nest_end(skb, nest);
863 	return 0;
864 
865 nla_put_failure:
866 	nla_nest_cancel(skb, nest);
867 	return -1;
868 }
869 
870 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
871 {
872 	const struct netem_sched_data *q = qdisc_priv(sch);
873 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
874 	struct tc_netem_qopt qopt;
875 	struct tc_netem_corr cor;
876 	struct tc_netem_reorder reorder;
877 	struct tc_netem_corrupt corrupt;
878 	struct tc_netem_rate rate;
879 
880 	qopt.latency = q->latency;
881 	qopt.jitter = q->jitter;
882 	qopt.limit = q->limit;
883 	qopt.loss = q->loss;
884 	qopt.gap = q->gap;
885 	qopt.duplicate = q->duplicate;
886 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
887 		goto nla_put_failure;
888 
889 	cor.delay_corr = q->delay_cor.rho;
890 	cor.loss_corr = q->loss_cor.rho;
891 	cor.dup_corr = q->dup_cor.rho;
892 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
893 		goto nla_put_failure;
894 
895 	reorder.probability = q->reorder;
896 	reorder.correlation = q->reorder_cor.rho;
897 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
898 		goto nla_put_failure;
899 
900 	corrupt.probability = q->corrupt;
901 	corrupt.correlation = q->corrupt_cor.rho;
902 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
903 		goto nla_put_failure;
904 
905 	rate.rate = q->rate;
906 	rate.packet_overhead = q->packet_overhead;
907 	rate.cell_size = q->cell_size;
908 	rate.cell_overhead = q->cell_overhead;
909 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
910 		goto nla_put_failure;
911 
912 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
913 		goto nla_put_failure;
914 
915 	if (dump_loss_model(q, skb) != 0)
916 		goto nla_put_failure;
917 
918 	return nla_nest_end(skb, nla);
919 
920 nla_put_failure:
921 	nlmsg_trim(skb, nla);
922 	return -1;
923 }
924 
925 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
926 			  struct sk_buff *skb, struct tcmsg *tcm)
927 {
928 	struct netem_sched_data *q = qdisc_priv(sch);
929 
930 	if (cl != 1 || !q->qdisc) 	/* only one class */
931 		return -ENOENT;
932 
933 	tcm->tcm_handle |= TC_H_MIN(1);
934 	tcm->tcm_info = q->qdisc->handle;
935 
936 	return 0;
937 }
938 
939 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
940 		     struct Qdisc **old)
941 {
942 	struct netem_sched_data *q = qdisc_priv(sch);
943 
944 	sch_tree_lock(sch);
945 	*old = q->qdisc;
946 	q->qdisc = new;
947 	if (*old) {
948 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
949 		qdisc_reset(*old);
950 	}
951 	sch_tree_unlock(sch);
952 
953 	return 0;
954 }
955 
956 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
957 {
958 	struct netem_sched_data *q = qdisc_priv(sch);
959 	return q->qdisc;
960 }
961 
962 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
963 {
964 	return 1;
965 }
966 
967 static void netem_put(struct Qdisc *sch, unsigned long arg)
968 {
969 }
970 
971 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
972 {
973 	if (!walker->stop) {
974 		if (walker->count >= walker->skip)
975 			if (walker->fn(sch, 1, walker) < 0) {
976 				walker->stop = 1;
977 				return;
978 			}
979 		walker->count++;
980 	}
981 }
982 
983 static const struct Qdisc_class_ops netem_class_ops = {
984 	.graft		=	netem_graft,
985 	.leaf		=	netem_leaf,
986 	.get		=	netem_get,
987 	.put		=	netem_put,
988 	.walk		=	netem_walk,
989 	.dump		=	netem_dump_class,
990 };
991 
992 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
993 	.id		=	"netem",
994 	.cl_ops		=	&netem_class_ops,
995 	.priv_size	=	sizeof(struct netem_sched_data),
996 	.enqueue	=	netem_enqueue,
997 	.dequeue	=	netem_dequeue,
998 	.peek		=	qdisc_peek_dequeued,
999 	.drop		=	netem_drop,
1000 	.init		=	netem_init,
1001 	.reset		=	netem_reset,
1002 	.destroy	=	netem_destroy,
1003 	.change		=	netem_change,
1004 	.dump		=	netem_dump,
1005 	.owner		=	THIS_MODULE,
1006 };
1007 
1008 
1009 static int __init netem_module_init(void)
1010 {
1011 	pr_info("netem: version " VERSION "\n");
1012 	return register_qdisc(&netem_qdisc_ops);
1013 }
1014 static void __exit netem_module_exit(void)
1015 {
1016 	unregister_qdisc(&netem_qdisc_ops);
1017 }
1018 module_init(netem_module_init)
1019 module_exit(netem_module_exit)
1020 MODULE_LICENSE("GPL");
1021