xref: /linux/net/sched/sch_netem.c (revision d584e204ff574b43d4dcaa87ae233c4e9d08e1fb)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 
27 #include <net/netlink.h>
28 #include <net/pkt_sched.h>
29 
30 #define VERSION "1.3"
31 
32 /*	Network Emulation Queuing algorithm.
33 	====================================
34 
35 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
36 		 Network Emulation Tool"
37 		 [2] Luigi Rizzo, DummyNet for FreeBSD
38 
39 	 ----------------------------------------------------------------
40 
41 	 This started out as a simple way to delay outgoing packets to
42 	 test TCP but has grown to include most of the functionality
43 	 of a full blown network emulator like NISTnet. It can delay
44 	 packets and add random jitter (and correlation). The random
45 	 distribution can be loaded from a table as well to provide
46 	 normal, Pareto, or experimental curves. Packet loss,
47 	 duplication, and reordering can also be emulated.
48 
49 	 This qdisc does not do classification; that can be handled by
50 	 layering other disciplines.  It does not need to do bandwidth
51 	 control either, since that can be handled by using token
52 	 bucket or other rate control.
53 
54      Correlated Loss Generator models
55 
56 	Added generation of correlated loss according to a 4-state
57 	Markov chain (the GI model) and the "Gilbert-Elliot" model.
58 
59 	References:
60 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
61 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
62 	and intuitive loss model for packet networks and its implementation
63 	in the Netem module in the Linux kernel", available in [1]
64 
65 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
66 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
67 */
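
/* Example usage (illustrative only: "eth0" is a placeholder device and the
 * exact option syntax depends on the iproute2 "tc" version in use):
 *
 *	# 100ms delay, 10ms jitter with 25% correlation, plus 0.5% loss
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.5%
 *
 * Rate control beyond the simple rate extension below is typically done by
 * layering a token bucket (tbf) or similar qdisc, as noted above.
 */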
68 
69 struct netem_sched_data {
70 	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
71 
72 	/* optional qdisc for classful handling (NULL at netem init) */
73 	struct Qdisc	*qdisc;
74 
75 	struct qdisc_watchdog watchdog;
76 
77 	psched_tdiff_t latency;
78 	psched_tdiff_t jitter;
79 
80 	u32 loss;
81 	u32 limit;
82 	u32 counter;
83 	u32 gap;
84 	u32 duplicate;
85 	u32 reorder;
86 	u32 corrupt;
87 	u32 rate;
88 	s32 packet_overhead;
89 	u32 cell_size;
90 	u32 cell_size_reciprocal;
91 	s32 cell_overhead;
92 
93 	struct crndstate {
94 		u32 last;
95 		u32 rho;
96 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
97 
98 	struct disttable {
99 		u32  size;
100 		s16 table[];
101 	} *delay_dist;
102 
103 	enum  {
104 		CLG_RANDOM,
105 		CLG_4_STATES,
106 		CLG_GILB_ELL,
107 	} loss_model;
108 
109 	/* Correlated Loss Generation models */
110 	struct clgstate {
111 		/* state of the Markov chain */
112 		u8 state;
113 
114 		/* 4-states and Gilbert-Elliot models */
115 		u32 a1;	/* p13 for 4-states or p for GE */
116 		u32 a2;	/* p31 for 4-states or r for GE */
117 		u32 a3;	/* p32 for 4-states or h for GE */
118 		u32 a4;	/* p14 for 4-states or 1-k for GE */
119 		u32 a5; /* p23 used only in 4-states */
120 	} clg;
121 
122 };
123 
124 /* Time stamp put into socket buffer control block
125  * Only valid when skbs are in our internal t(ime)fifo queue.
126  */
127 struct netem_skb_cb {
128 	psched_time_t	time_to_send;
129 };
130 
131 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
132 {
133 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
134 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
135 }
136 
137 /* init_crandom - initialize correlated random number generator
138  * Use entropy source for initial seed.
139  */
140 static void init_crandom(struct crndstate *state, unsigned long rho)
141 {
142 	state->rho = rho;
143 	state->last = net_random();
144 }
145 
146 /* get_crandom - correlated random number generator
147  * Next number depends on last value.
148  * rho is scaled to avoid floating point.
149  */
150 static u32 get_crandom(struct crndstate *state)
151 {
152 	u64 value, rho;
153 	unsigned long answer;
154 
155 	if (state->rho == 0)	/* no correlation */
156 		return net_random();
157 
158 	value = net_random();
159 	rho = (u64)state->rho + 1;
160 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
161 	state->last = answer;
162 	return answer;
163 }
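
/* In fixed-point terms, get_crandom() computes the first-order
 * autoregressive blend
 *
 *	answer = ((2^32 - rho) * U + rho * last) >> 32
 *
 * where U is a fresh uniform 32-bit value and rho = state->rho + 1 acts as
 * a correlation coefficient scaled to 2^32.  As an example with an assumed
 * value (not from the source), rho ~= 2^30 (about 25% correlation) weights
 * the previous output by roughly one quarter and the new random draw by
 * roughly three quarters.
 */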
164 
165 /* loss_4state - 4-state model loss generator
166  * Generates losses according to the 4-state Markov chain adopted in
167  * the GI (General and Intuitive) loss model.
168  */
169 static bool loss_4state(struct netem_sched_data *q)
170 {
171 	struct clgstate *clg = &q->clg;
172 	u32 rnd = net_random();
173 
174 	/*
175 	 * Makes a comparison between rnd and the transition
176 	 * probabilities outgoing from the current state, then decides the
177 	 * next state and if the next packet has to be transmitted or lost.
178 	 * The four states correspond to:
179 	 *   1 => successfully transmitted packets within a gap period
180 	 *   4 => isolated losses within a gap period
181 	 *   3 => lost packets within a burst period
182 	 *   2 => successfully transmitted packets within a burst period
183 	 */
184 	switch (clg->state) {
185 	case 1:
186 		if (rnd < clg->a4) {
187 			clg->state = 4;
188 			return true;
189 		} else if (clg->a4 < rnd && rnd < clg->a1) {
190 			clg->state = 3;
191 			return true;
192 		} else if (clg->a1 < rnd)
193 			clg->state = 1;
194 
195 		break;
196 	case 2:
197 		if (rnd < clg->a5) {
198 			clg->state = 3;
199 			return true;
200 		} else
201 			clg->state = 2;
202 
203 		break;
204 	case 3:
205 		if (rnd < clg->a3)
206 			clg->state = 2;
207 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
208 			clg->state = 1;
209 			return true;
210 		} else if (clg->a2 + clg->a3 < rnd) {
211 			clg->state = 3;
212 			return true;
213 		}
214 		break;
215 	case 4:
216 		clg->state = 1;
217 		break;
218 	}
219 
220 	return false;
221 }
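
/* Summary of the 4-state (GI model) parameters used above, restating the
 * mapping from the clgstate comment for quick reference:
 *
 *	a1 = p13	gap tx      -> burst loss	(state 1 -> 3)
 *	a2 = p31	burst loss  -> gap tx		(state 3 -> 1)
 *	a3 = p32	burst loss  -> burst tx		(state 3 -> 2)
 *	a4 = p14	gap tx      -> isolated loss	(state 1 -> 4)
 *	a5 = p23	burst tx    -> burst loss	(state 2 -> 3)
 *
 * rnd is a uniform 32-bit value, so each a_i is a transition probability
 * expressed as a fraction of 2^32.
 */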
222 
223 /* loss_gilb_ell - Gilbert-Elliot model loss generator
224  * Generates losses according to the Gilbert-Elliot loss model or
225  * its special cases  (Gilbert or Simple Gilbert)
226  *
227  * Makes a comparison between random number and the transition
228  * probabilities outgoing from the current state, then decides the
229  * next state. A second random number is extracted and the comparison
230  * with the loss probability of the current state decides if the next
231  * packet will be transmitted or lost.
232  */
233 static bool loss_gilb_ell(struct netem_sched_data *q)
234 {
235 	struct clgstate *clg = &q->clg;
236 
237 	switch (clg->state) {
238 	case 1:
239 		if (net_random() < clg->a1)
240 			clg->state = 2;
241 		if (net_random() < clg->a4)
242 			return true;
		break;
243 	case 2:
244 		if (net_random() < clg->a2)
245 			clg->state = 1;
246 		if (net_random() < clg->a3)
247 			return true;
248 	}
249 
250 	return false;
251 }
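
/* Gilbert-Elliot parameters as used above (see get_loss_clg()): a1 = p,
 * a2 = r, a3 = h and a4 = 1-k, each compared against a uniform 32-bit
 * random value, i.e. expressed as a fraction of 2^32.  a1/a2 drive the
 * good<->bad state transitions, while a4 (state 1) and a3 (state 2)
 * decide whether the current packet is lost in that state.
 */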
252 
253 static bool loss_event(struct netem_sched_data *q)
254 {
255 	switch (q->loss_model) {
256 	case CLG_RANDOM:
257 		/* Random packet drop 0 => none, ~0 => all */
258 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
259 
260 	case CLG_4_STATES:
261 		/* 4-state loss model algorithm (also used for the GI model).
262 		 * Extracts a loss decision from the 4-state Markov chain
263 		 * loss generator; the packet is dropped when the generator
264 		 * returns true.
265 		 */
266 		return loss_4state(q);
267 
268 	case CLG_GILB_ELL:
269 		/* Gilbert-Elliot loss model algorithm.
270 		 * Extracts a loss decision from the Gilbert-Elliot loss
271 		 * generator; the packet is dropped when the generator
272 		 * returns true.
273 		 */
274 		return loss_gilb_ell(q);
275 	}
276 
277 	return false;	/* not reached */
278 }
279 
280 
281 /* tabledist - return a pseudo-randomly distributed value with mean mu and
282  * std deviation sigma.  Uses table lookup to approximate the desired
283  * distribution, and a uniformly-distributed pseudo-random source.
284  */
285 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
286 				struct crndstate *state,
287 				const struct disttable *dist)
288 {
289 	psched_tdiff_t x;
290 	long t;
291 	u32 rnd;
292 
293 	if (sigma == 0)
294 		return mu;
295 
296 	rnd = get_crandom(state);
297 
298 	/* default uniform distribution */
299 	if (dist == NULL)
300 		return (rnd % (2*sigma)) - sigma + mu;
301 
302 	t = dist->table[rnd % dist->size];
303 	x = (sigma % NETEM_DIST_SCALE) * t;
304 	if (x >= 0)
305 		x += NETEM_DIST_SCALE/2;
306 	else
307 		x -= NETEM_DIST_SCALE/2;
308 
309 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
310 }
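
/* tabledist() computes, to integer precision,
 *
 *	mu + sigma * t / NETEM_DIST_SCALE
 *
 * where t is a signed table entry, splitting sigma into quotient and
 * remainder so the multiplication cannot overflow, and rounding the
 * remainder term to the nearest tick.  A worked example with assumed
 * values (not taken from the source): mu = 100000, sigma = 10000 and
 * t = 4096 with NETEM_DIST_SCALE = 8192 gives 100000 + 5000 = 105000,
 * i.e. a delay of mu plus half of sigma for that table entry.
 */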
311 
312 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
313 {
314 	u64 ticks;
315 
316 	len += q->packet_overhead;
317 
318 	if (q->cell_size) {
319 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
320 
321 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
322 			cells++;
323 		len = cells * (q->cell_size + q->cell_overhead);
324 	}
325 
326 	ticks = (u64)len * NSEC_PER_SEC;
327 
328 	do_div(ticks, q->rate);
329 	return PSCHED_NS2TICKS(ticks);
330 }
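
/* Sketch of the transmission-time math above, with assumed example values
 * (not from the source): q->rate is in bytes per second, so the division
 * below yields nanoseconds.  A 100 byte packet at rate = 125000 (1 Mbit/s),
 * packet_overhead = 0, cell_size = 48 and cell_overhead = 5 (ATM-like
 * cells) becomes ceil(100 / 48) = 3 cells, i.e. 3 * (48 + 5) = 159 bytes
 * on the wire, and
 *
 *	159 * NSEC_PER_SEC / 125000 = 1272000 ns (about 1.27 ms)
 *
 * before conversion to scheduler ticks by PSCHED_NS2TICKS().
 */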
331 
332 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
333 {
334 	struct sk_buff_head *list = &sch->q;
335 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
336 	struct sk_buff *skb;
337 
338 	if (likely(skb_queue_len(list) < sch->limit)) {
339 		skb = skb_peek_tail(list);
340 		/* Optimize for add at tail */
341 		if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
342 			return qdisc_enqueue_tail(nskb, sch);
343 
344 		skb_queue_reverse_walk(list, skb) {
345 			if (tnext >= netem_skb_cb(skb)->time_to_send)
346 				break;
347 		}
348 
349 		__skb_queue_after(list, skb, nskb);
350 		sch->qstats.backlog += qdisc_pkt_len(nskb);
351 		return NET_XMIT_SUCCESS;
352 	}
353 
354 	return qdisc_reshape_fail(nskb, sch);
355 }
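
/* The tfifo is kept sorted by time_to_send: the common case (no reordering,
 * monotonically increasing send times) appends at the tail in O(1), while a
 * packet with a smaller time_to_send walks backwards from the tail until it
 * finds its slot, which is O(n) in the worst case.  sch->limit bounds the
 * queue length; overlimit packets go through qdisc_reshape_fail().
 */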
356 
357 /*
358  * Insert one skb into qdisc.
359  * Note: parent depends on return value to account for queue length.
360  * 	NET_XMIT_DROP: queue length didn't change.
361  *      NET_XMIT_SUCCESS: one skb was queued.
362  */
363 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
364 {
365 	struct netem_sched_data *q = qdisc_priv(sch);
366 	/* We don't fill cb now as skb_unshare() may invalidate it */
367 	struct netem_skb_cb *cb;
368 	struct sk_buff *skb2;
369 	int ret;
370 	int count = 1;
371 
372 	/* Random duplication */
373 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
374 		++count;
375 
376 	/* Drop packet? */
377 	if (loss_event(q))
378 		--count;
379 
380 	if (count == 0) {
381 		sch->qstats.drops++;
382 		kfree_skb(skb);
383 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
384 	}
385 
386 	skb_orphan(skb);
387 
388 	/*
389 	 * If we need to duplicate the packet, then re-insert it at the top
390 	 * of the qdisc tree, since the parent queuer expects that only one
391 	 * skb will be queued.
392 	 */
393 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
394 		struct Qdisc *rootq = qdisc_root(sch);
395 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
396 		q->duplicate = 0;
397 
398 		qdisc_enqueue_root(skb2, rootq);
399 		q->duplicate = dupsave;
400 	}
401 
402 	/*
403 	 * Randomized packet corruption.
404 	 * Make a copy if needed since we are modifying the data.
405 	 * If the packet is going to be hardware checksummed, then
406 	 * do the checksum now in software before we mangle it.
407 	 */
408 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
409 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
410 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
411 		     skb_checksum_help(skb)))
412 			return qdisc_drop(skb, sch);
413 
414 		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
415 	}
416 
417 	cb = netem_skb_cb(skb);
418 	if (q->gap == 0 ||		/* not doing reordering */
419 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
420 	    q->reorder < get_crandom(&q->reorder_cor)) {
421 		psched_time_t now;
422 		psched_tdiff_t delay;
423 
424 		delay = tabledist(q->latency, q->jitter,
425 				  &q->delay_cor, q->delay_dist);
426 
427 		now = psched_get_time();
428 
429 		if (q->rate) {
430 			struct sk_buff_head *list = &sch->q;
431 
432 			delay += packet_len_2_sched_time(skb->len, q);
433 
434 			if (!skb_queue_empty(list)) {
435 				/*
436 				 * Last packet in queue is reference point (now).
437 				 * First packet in queue is already in flight;
438 				 * calculate this time bonus and subtract it
439 				 * from the delay.
440 				 */
441 				delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
442 				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
443 			}
444 		}
445 
446 		cb->time_to_send = now + delay;
447 		++q->counter;
448 		ret = tfifo_enqueue(skb, sch);
449 	} else {
450 		/*
451 		 * Do re-ordering by putting one out of N packets at the front
452 		 * of the queue.
453 		 */
454 		cb->time_to_send = psched_get_time();
455 		q->counter = 0;
456 
457 		__skb_queue_head(&sch->q, skb);
458 		sch->qstats.backlog += qdisc_pkt_len(skb);
459 		sch->qstats.requeues++;
460 		ret = NET_XMIT_SUCCESS;
461 	}
462 
463 	if (ret != NET_XMIT_SUCCESS) {
464 		if (net_xmit_drop_count(ret)) {
465 			sch->qstats.drops++;
466 			return ret;
467 		}
468 	}
469 
470 	return NET_XMIT_SUCCESS;
471 }
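
/* Reordering example with assumed parameters (not from the source): with
 * gap = 5 and reorder probability 100%, q->counter lets four packets take
 * the delayed tfifo path and the fifth hits the else-branch above, getting
 * time_to_send = now and being queued at the head, so it overtakes the
 * packets still waiting out their delay.  With a smaller reorder
 * probability, each packet past the gap gets that chance of being sent
 * early until one is, which resets the counter.
 */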
472 
473 static unsigned int netem_drop(struct Qdisc *sch)
474 {
475 	struct netem_sched_data *q = qdisc_priv(sch);
476 	unsigned int len;
477 
478 	len = qdisc_queue_drop(sch);
479 	if (!len && q->qdisc && q->qdisc->ops->drop)
480 		len = q->qdisc->ops->drop(q->qdisc);
481 	if (len)
482 		sch->qstats.drops++;
483 
484 	return len;
485 }
486 
487 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
488 {
489 	struct netem_sched_data *q = qdisc_priv(sch);
490 	struct sk_buff *skb;
491 
492 	if (qdisc_is_throttled(sch))
493 		return NULL;
494 
495 tfifo_dequeue:
496 	skb = qdisc_peek_head(sch);
497 	if (skb) {
498 		const struct netem_skb_cb *cb = netem_skb_cb(skb);
499 
500 		/* is it time to send this packet yet? */
501 		if (cb->time_to_send <= psched_get_time()) {
502 			__skb_unlink(skb, &sch->q);
503 			sch->qstats.backlog -= qdisc_pkt_len(skb);
504 
505 #ifdef CONFIG_NET_CLS_ACT
506 			/*
507 			 * If it's at ingress let's pretend the delay is
508 			 * from the network (tstamp will be updated).
509 			 */
510 			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
511 				skb->tstamp.tv64 = 0;
512 #endif
513 
514 			if (q->qdisc) {
515 				int err = qdisc_enqueue(skb, q->qdisc);
516 
517 				if (unlikely(err != NET_XMIT_SUCCESS)) {
518 					if (net_xmit_drop_count(err)) {
519 						sch->qstats.drops++;
520 						qdisc_tree_decrease_qlen(sch, 1);
521 					}
522 				}
523 				goto tfifo_dequeue;
524 			}
525 deliver:
526 			qdisc_unthrottled(sch);
527 			qdisc_bstats_update(sch, skb);
528 			return skb;
529 		}
530 
531 		if (q->qdisc) {
532 			skb = q->qdisc->ops->dequeue(q->qdisc);
533 			if (skb)
534 				goto deliver;
535 		}
536 		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
537 	}
538 
539 	if (q->qdisc) {
540 		skb = q->qdisc->ops->dequeue(q->qdisc);
541 		if (skb)
542 			goto deliver;
543 	}
544 	return NULL;
545 }
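
/* Dequeue summary: packets whose time_to_send has arrived are either
 * delivered directly or, when a child qdisc has been grafted, re-enqueued
 * into that child and delivered from it; when the head packet's time is
 * still in the future, the qdisc watchdog is armed for time_to_send and
 * NULL is returned, so the dequeue is retried once the timer fires.
 */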
546 
547 static void netem_reset(struct Qdisc *sch)
548 {
549 	struct netem_sched_data *q = qdisc_priv(sch);
550 
551 	qdisc_reset_queue(sch);
552 	if (q->qdisc)
553 		qdisc_reset(q->qdisc);
554 	qdisc_watchdog_cancel(&q->watchdog);
555 }
556 
557 static void dist_free(struct disttable *d)
558 {
559 	if (d) {
560 		if (is_vmalloc_addr(d))
561 			vfree(d);
562 		else
563 			kfree(d);
564 	}
565 }
566 
567 /*
568  * Distribution data is a variable size payload containing
569  * signed 16 bit values.
570  */
571 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
572 {
573 	struct netem_sched_data *q = qdisc_priv(sch);
574 	size_t n = nla_len(attr)/sizeof(__s16);
575 	const __s16 *data = nla_data(attr);
576 	spinlock_t *root_lock;
577 	struct disttable *d;
578 	int i;
579 	size_t s;
580 
581 	if (n > NETEM_DIST_MAX)
582 		return -EINVAL;
583 
584 	s = sizeof(struct disttable) + n * sizeof(s16);
585 	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
586 	if (!d)
587 		d = vmalloc(s);
588 	if (!d)
589 		return -ENOMEM;
590 
591 	d->size = n;
592 	for (i = 0; i < n; i++)
593 		d->table[i] = data[i];
594 
595 	root_lock = qdisc_root_sleeping_lock(sch);
596 
597 	spin_lock_bh(root_lock);
598 	swap(q->delay_dist, d);
599 	spin_unlock_bh(root_lock);
600 
601 	dist_free(d);
602 	return 0;
603 }
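
/* The delay distribution table is an opaque array of signed 16-bit values
 * scaled by NETEM_DIST_SCALE; userspace (for example the distribution
 * files shipped with iproute2, such as the "normal" and "pareto" tables
 * mentioned in the header comment) generates it offline and passes it in
 * via TCA_NETEM_DELAY_DIST.  tabledist() above then picks entries at
 * random to shape the jitter around the configured latency.
 */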
604 
605 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
606 {
607 	struct netem_sched_data *q = qdisc_priv(sch);
608 	const struct tc_netem_corr *c = nla_data(attr);
609 
610 	init_crandom(&q->delay_cor, c->delay_corr);
611 	init_crandom(&q->loss_cor, c->loss_corr);
612 	init_crandom(&q->dup_cor, c->dup_corr);
613 }
614 
615 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
616 {
617 	struct netem_sched_data *q = qdisc_priv(sch);
618 	const struct tc_netem_reorder *r = nla_data(attr);
619 
620 	q->reorder = r->probability;
621 	init_crandom(&q->reorder_cor, r->correlation);
622 }
623 
624 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
625 {
626 	struct netem_sched_data *q = qdisc_priv(sch);
627 	const struct tc_netem_corrupt *r = nla_data(attr);
628 
629 	q->corrupt = r->probability;
630 	init_crandom(&q->corrupt_cor, r->correlation);
631 }
632 
633 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
634 {
635 	struct netem_sched_data *q = qdisc_priv(sch);
636 	const struct tc_netem_rate *r = nla_data(attr);
637 
638 	q->rate = r->rate;
639 	q->packet_overhead = r->packet_overhead;
640 	q->cell_size = r->cell_size;
641 	if (q->cell_size)
642 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
643 	q->cell_overhead = r->cell_overhead;
644 }
645 
646 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
647 {
648 	struct netem_sched_data *q = qdisc_priv(sch);
649 	const struct nlattr *la;
650 	int rem;
651 
652 	nla_for_each_nested(la, attr, rem) {
653 		u16 type = nla_type(la);
654 
655 		switch(type) {
656 		case NETEM_LOSS_GI: {
657 			const struct tc_netem_gimodel *gi = nla_data(la);
658 
659 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
660 				pr_info("netem: incorrect gi model size\n");
661 				return -EINVAL;
662 			}
663 
664 			q->loss_model = CLG_4_STATES;
665 
666 			q->clg.state = 1;
667 			q->clg.a1 = gi->p13;
668 			q->clg.a2 = gi->p31;
669 			q->clg.a3 = gi->p32;
670 			q->clg.a4 = gi->p14;
671 			q->clg.a5 = gi->p23;
672 			break;
673 		}
674 
675 		case NETEM_LOSS_GE: {
676 			const struct tc_netem_gemodel *ge = nla_data(la);
677 
678 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
679 				pr_info("netem: incorrect ge model size\n");
680 				return -EINVAL;
681 			}
682 
683 			q->loss_model = CLG_GILB_ELL;
684 			q->clg.state = 1;
685 			q->clg.a1 = ge->p;
686 			q->clg.a2 = ge->r;
687 			q->clg.a3 = ge->h;
688 			q->clg.a4 = ge->k1;
689 			break;
690 		}
691 
692 		default:
693 			pr_info("netem: unknown loss type %u\n", type);
694 			return -EINVAL;
695 		}
696 	}
697 
698 	return 0;
699 }
700 
701 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
702 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
703 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
704 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
705 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
706 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
707 };
708 
709 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
710 		      const struct nla_policy *policy, int len)
711 {
712 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
713 
714 	if (nested_len < 0) {
715 		pr_info("netem: invalid attributes len %d\n", nested_len);
716 		return -EINVAL;
717 	}
718 
719 	if (nested_len >= nla_attr_size(0))
720 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
721 				 nested_len, policy);
722 
723 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
724 	return 0;
725 }
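
/* parse_attr() exists because netem's TCA_OPTIONS payload is, for
 * historical reasons, a struct tc_netem_qopt followed directly by optional
 * netlink attributes rather than a conventional nested attribute block:
 *
 *	+----------------------+----------------+----------------+...
 *	| struct tc_netem_qopt | TCA_NETEM_CORR | TCA_NETEM_...  |
 *	+----------------------+----------------+----------------+...
 *
 * so the attributes are parsed starting NLA_ALIGN(sizeof(*qopt)) bytes
 * into the option data (a sketch of the layout, inferred from
 * netem_change() and netem_dump() in this file).
 */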
726 
727 /* Parse netlink message to set options */
728 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
729 {
730 	struct netem_sched_data *q = qdisc_priv(sch);
731 	struct nlattr *tb[TCA_NETEM_MAX + 1];
732 	struct tc_netem_qopt *qopt;
733 	int ret;
734 
735 	if (opt == NULL)
736 		return -EINVAL;
737 
738 	qopt = nla_data(opt);
739 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
740 	if (ret < 0)
741 		return ret;
742 
743 	sch->limit = qopt->limit;
744 
745 	q->latency = qopt->latency;
746 	q->jitter = qopt->jitter;
747 	q->limit = qopt->limit;
748 	q->gap = qopt->gap;
749 	q->counter = 0;
750 	q->loss = qopt->loss;
751 	q->duplicate = qopt->duplicate;
752 
753 	/* For compatibility with earlier versions:
754 	 * if gap is set, assume 100% reorder probability.
755 	 */
756 	if (q->gap)
757 		q->reorder = ~0;
758 
759 	if (tb[TCA_NETEM_CORR])
760 		get_correlation(sch, tb[TCA_NETEM_CORR]);
761 
762 	if (tb[TCA_NETEM_DELAY_DIST]) {
763 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
764 		if (ret)
765 			return ret;
766 	}
767 
768 	if (tb[TCA_NETEM_REORDER])
769 		get_reorder(sch, tb[TCA_NETEM_REORDER]);
770 
771 	if (tb[TCA_NETEM_CORRUPT])
772 		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
773 
774 	if (tb[TCA_NETEM_RATE])
775 		get_rate(sch, tb[TCA_NETEM_RATE]);
776 
777 	q->loss_model = CLG_RANDOM;
778 	if (tb[TCA_NETEM_LOSS])
779 		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
780 
781 	return ret;
782 }
783 
784 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
785 {
786 	struct netem_sched_data *q = qdisc_priv(sch);
787 	int ret;
788 
789 	if (!opt)
790 		return -EINVAL;
791 
792 	qdisc_watchdog_init(&q->watchdog, sch);
793 
794 	q->loss_model = CLG_RANDOM;
795 	ret = netem_change(sch, opt);
796 	if (ret)
797 		pr_info("netem: change failed\n");
798 	return ret;
799 }
800 
801 static void netem_destroy(struct Qdisc *sch)
802 {
803 	struct netem_sched_data *q = qdisc_priv(sch);
804 
805 	qdisc_watchdog_cancel(&q->watchdog);
806 	if (q->qdisc)
807 		qdisc_destroy(q->qdisc);
808 	dist_free(q->delay_dist);
809 }
810 
811 static int dump_loss_model(const struct netem_sched_data *q,
812 			   struct sk_buff *skb)
813 {
814 	struct nlattr *nest;
815 
816 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
817 	if (nest == NULL)
818 		goto nla_put_failure;
819 
820 	switch (q->loss_model) {
821 	case CLG_RANDOM:
822 		/* legacy loss model */
823 		nla_nest_cancel(skb, nest);
824 		return 0;	/* no data */
825 
826 	case CLG_4_STATES: {
827 		struct tc_netem_gimodel gi = {
828 			.p13 = q->clg.a1,
829 			.p31 = q->clg.a2,
830 			.p32 = q->clg.a3,
831 			.p14 = q->clg.a4,
832 			.p23 = q->clg.a5,
833 		};
834 
835 		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
836 		break;
837 	}
838 	case CLG_GILB_ELL: {
839 		struct tc_netem_gemodel ge = {
840 			.p = q->clg.a1,
841 			.r = q->clg.a2,
842 			.h = q->clg.a3,
843 			.k1 = q->clg.a4,
844 		};
845 
846 		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
847 		break;
848 	}
849 	}
850 
851 	nla_nest_end(skb, nest);
852 	return 0;
853 
854 nla_put_failure:
855 	nla_nest_cancel(skb, nest);
856 	return -1;
857 }
858 
859 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
860 {
861 	const struct netem_sched_data *q = qdisc_priv(sch);
862 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
863 	struct tc_netem_qopt qopt;
864 	struct tc_netem_corr cor;
865 	struct tc_netem_reorder reorder;
866 	struct tc_netem_corrupt corrupt;
867 	struct tc_netem_rate rate;
868 
869 	qopt.latency = q->latency;
870 	qopt.jitter = q->jitter;
871 	qopt.limit = q->limit;
872 	qopt.loss = q->loss;
873 	qopt.gap = q->gap;
874 	qopt.duplicate = q->duplicate;
875 	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
876 
877 	cor.delay_corr = q->delay_cor.rho;
878 	cor.loss_corr = q->loss_cor.rho;
879 	cor.dup_corr = q->dup_cor.rho;
880 	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
881 
882 	reorder.probability = q->reorder;
883 	reorder.correlation = q->reorder_cor.rho;
884 	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
885 
886 	corrupt.probability = q->corrupt;
887 	corrupt.correlation = q->corrupt_cor.rho;
888 	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
889 
890 	rate.rate = q->rate;
891 	rate.packet_overhead = q->packet_overhead;
892 	rate.cell_size = q->cell_size;
893 	rate.cell_overhead = q->cell_overhead;
894 	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
895 
896 	if (dump_loss_model(q, skb) != 0)
897 		goto nla_put_failure;
898 
899 	return nla_nest_end(skb, nla);
900 
901 nla_put_failure:
902 	nlmsg_trim(skb, nla);
903 	return -1;
904 }
905 
906 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
907 			  struct sk_buff *skb, struct tcmsg *tcm)
908 {
909 	struct netem_sched_data *q = qdisc_priv(sch);
910 
911 	if (cl != 1 || !q->qdisc) 	/* only one class */
912 		return -ENOENT;
913 
914 	tcm->tcm_handle |= TC_H_MIN(1);
915 	tcm->tcm_info = q->qdisc->handle;
916 
917 	return 0;
918 }
919 
920 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
921 		     struct Qdisc **old)
922 {
923 	struct netem_sched_data *q = qdisc_priv(sch);
924 
925 	sch_tree_lock(sch);
926 	*old = q->qdisc;
927 	q->qdisc = new;
928 	if (*old) {
929 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
930 		qdisc_reset(*old);
931 	}
932 	sch_tree_unlock(sch);
933 
934 	return 0;
935 }
936 
937 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
938 {
939 	struct netem_sched_data *q = qdisc_priv(sch);
940 	return q->qdisc;
941 }
942 
943 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
944 {
945 	return 1;
946 }
947 
948 static void netem_put(struct Qdisc *sch, unsigned long arg)
949 {
950 }
951 
952 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
953 {
954 	if (!walker->stop) {
955 		if (walker->count >= walker->skip)
956 			if (walker->fn(sch, 1, walker) < 0) {
957 				walker->stop = 1;
958 				return;
959 			}
960 		walker->count++;
961 	}
962 }
963 
964 static const struct Qdisc_class_ops netem_class_ops = {
965 	.graft		=	netem_graft,
966 	.leaf		=	netem_leaf,
967 	.get		=	netem_get,
968 	.put		=	netem_put,
969 	.walk		=	netem_walk,
970 	.dump		=	netem_dump_class,
971 };
972 
973 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
974 	.id		=	"netem",
975 	.cl_ops		=	&netem_class_ops,
976 	.priv_size	=	sizeof(struct netem_sched_data),
977 	.enqueue	=	netem_enqueue,
978 	.dequeue	=	netem_dequeue,
979 	.peek		=	qdisc_peek_dequeued,
980 	.drop		=	netem_drop,
981 	.init		=	netem_init,
982 	.reset		=	netem_reset,
983 	.destroy	=	netem_destroy,
984 	.change		=	netem_change,
985 	.dump		=	netem_dump,
986 	.owner		=	THIS_MODULE,
987 };
988 
989 
990 static int __init netem_module_init(void)
991 {
992 	pr_info("netem: version " VERSION "\n");
993 	return register_qdisc(&netem_qdisc_ops);
994 }
995 static void __exit netem_module_exit(void)
996 {
997 	unregister_qdisc(&netem_qdisc_ops);
998 }
999 module_init(netem_module_init)
1000 module_exit(netem_module_exit)
1001 MODULE_LICENSE("GPL");
1002