xref: /linux/net/sched/sch_netem.c (revision b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool"
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification; that can be handled by
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either, since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to a 4-state
59 	Markov model (the GI model) and to the 2-state "Gilbert-Elliot" model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
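
/* For illustration only (userspace side, not part of this file): the
 * behaviour described above is normally configured through iproute2's
 * tc front end, e.g. roughly:
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *   tc qdisc add dev eth0 root netem loss 0.3% 25% duplicate 1%
 *   tc qdisc add dev eth0 root netem delay 10ms reorder 25% 50%
 *
 * The parameters arrive here as a struct tc_netem_qopt plus nested
 * TCA_NETEM_* attributes and are parsed in netem_change() below.
 */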
70 
71 struct netem_sched_data {
72 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
73 	struct rb_root t_root;
74 
75 	/* optional qdisc for classful handling (NULL at netem init) */
76 	struct Qdisc	*qdisc;
77 
78 	struct qdisc_watchdog watchdog;
79 
80 	psched_tdiff_t latency;
81 	psched_tdiff_t jitter;
82 
83 	u32 loss;
84 	u32 ecn;
85 	u32 limit;
86 	u32 counter;
87 	u32 gap;
88 	u32 duplicate;
89 	u32 reorder;
90 	u32 corrupt;
91 	u64 rate;
92 	s32 packet_overhead;
93 	u32 cell_size;
94 	struct reciprocal_value cell_size_reciprocal;
95 	s32 cell_overhead;
96 
97 	struct crndstate {
98 		u32 last;
99 		u32 rho;
100 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
101 
102 	struct disttable {
103 		u32  size;
104 		s16 table[0];
105 	} *delay_dist;
106 
107 	enum  {
108 		CLG_RANDOM,
109 		CLG_4_STATES,
110 		CLG_GILB_ELL,
111 	} loss_model;
112 
113 	enum {
114 		TX_IN_GAP_PERIOD = 1,
115 		TX_IN_BURST_PERIOD,
116 		LOST_IN_GAP_PERIOD,
117 		LOST_IN_BURST_PERIOD,
118 	} _4_state_model;
119 
120 	enum {
121 		GOOD_STATE = 1,
122 		BAD_STATE,
123 	} GE_state_model;
124 
125 	/* Correlated Loss Generation models */
126 	struct clgstate {
127 		/* state of the Markov chain */
128 		u8 state;
129 
130 		/* 4-states and Gilbert-Elliot models */
131 		u32 a1;	/* p13 for 4-states or p for GE */
132 		u32 a2;	/* p31 for 4-states or r for GE */
133 		u32 a3;	/* p32 for 4-states or h for GE */
134 		u32 a4;	/* p14 for 4-states or 1-k for GE */
135 		u32 a5; /* p23 used only in 4-states */
136 	} clg;
137 
138 };
139 
140 /* Time stamp put into socket buffer control block
141  * Only valid when skbs are in our internal t(ime)fifo queue.
142  *
143  * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
144  * and skb->next & skb->prev are scratch space for a qdisc,
145  * we save the skb->tstamp value in skb->cb[] before destroying it.
146  */
147 struct netem_skb_cb {
148 	psched_time_t	time_to_send;
149 };
150 
151 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
152 {
153 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
154 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
155 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
156 }
157 
158 /* init_crandom - initialize correlated random number generator
159  * Seed the initial value from the pseudo-random source.
160  */
161 static void init_crandom(struct crndstate *state, unsigned long rho)
162 {
163 	state->rho = rho;
164 	state->last = prandom_u32();
165 }
166 
167 /* get_crandom - correlated random number generator
168  * Next number depends on last value.
169  * rho is scaled to avoid floating point.
170  */
171 static u32 get_crandom(struct crndstate *state)
172 {
173 	u64 value, rho;
174 	unsigned long answer;
175 
176 	if (state->rho == 0)	/* no correlation */
177 		return prandom_u32();
178 
179 	value = prandom_u32();
180 	rho = (u64)state->rho + 1;
181 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
182 	state->last = answer;
183 	return answer;
184 }
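
/* The generator above computes, with U = prandom_u32() and rho' = rho + 1,
 *
 *   answer = ((2^32 - rho') * U + rho' * last) >> 32
 *
 * i.e. a fixed-point weighted average of fresh randomness and the previous
 * output: rho == 0 degenerates to plain prandom_u32(), while rho == ~0U
 * reproduces the previous value exactly.
 */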
185 
186 /* loss_4state - 4-state model loss generator
187  * Generates losses according to the 4-state Markov chain adopted in
188  * the GI (General and Intuitive) loss model.
189  */
190 static bool loss_4state(struct netem_sched_data *q)
191 {
192 	struct clgstate *clg = &q->clg;
193 	u32 rnd = prandom_u32();
194 
195 	/*
196 	 * Compares rnd with the transition probabilities out of the current
197 	 * state, then decides the next state and whether the next packet
198 	 * has to be transmitted or lost.
199 	 * The four states correspond to:
200 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
201 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
202 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
203 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
204 	 */
205 	switch (clg->state) {
206 	case TX_IN_GAP_PERIOD:
207 		if (rnd < clg->a4) {
208 			clg->state = LOST_IN_BURST_PERIOD;
209 			return true;
210 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
211 			clg->state = LOST_IN_GAP_PERIOD;
212 			return true;
213 		} else if (clg->a1 + clg->a4 < rnd) {
214 			clg->state = TX_IN_GAP_PERIOD;
215 		}
216 
217 		break;
218 	case TX_IN_BURST_PERIOD:
219 		if (rnd < clg->a5) {
220 			clg->state = LOST_IN_GAP_PERIOD;
221 			return true;
222 		} else {
223 			clg->state = TX_IN_BURST_PERIOD;
224 		}
225 
226 		break;
227 	case LOST_IN_GAP_PERIOD:
228 		if (rnd < clg->a3)
229 			clg->state = TX_IN_BURST_PERIOD;
230 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
231 			clg->state = TX_IN_GAP_PERIOD;
232 		} else if (clg->a2 + clg->a3 < rnd) {
233 			clg->state = LOST_IN_GAP_PERIOD;
234 			return true;
235 		}
236 		break;
237 	case LOST_IN_BURST_PERIOD:
238 		clg->state = TX_IN_GAP_PERIOD;
239 		break;
240 	}
241 
242 	return false;
243 }
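
/* Reading loss_4state() above: a1..a5 are 32-bit fixed-point probabilities
 * (~0U == 100%), loaded from the p13/p31/p32/p14/p23 fields of
 * struct tc_netem_gimodel in get_loss_clg() below.  For example, while the
 * chain sits in TX_IN_GAP_PERIOD a packet is lost with probability roughly
 * (a1 + a4) / 2^32: the a4 share as an isolated loss (via
 * LOST_IN_BURST_PERIOD, which returns to the gap state immediately) and
 * the a1 share as the start of a loss burst (LOST_IN_GAP_PERIOD).
 */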
244 
245 /* loss_gilb_ell - Gilbert-Elliot model loss generator
246  * Generates losses according to the Gilbert-Elliot loss model or
247  * its special cases  (Gilbert or Simple Gilbert)
248  * its special cases (Gilbert or Simple Gilbert).
249  *
250  * Compares a random number with the transition probabilities out of
251  * the current state, then decides the next state.  A second random
252  * number is extracted and the comparison with the loss probability
253  * of the current state decides if the next packet will be
254  * transmitted or lost.
255 static bool loss_gilb_ell(struct netem_sched_data *q)
256 {
257 	struct clgstate *clg = &q->clg;
258 
259 	switch (clg->state) {
260 	case GOOD_STATE:
261 		if (prandom_u32() < clg->a1)
262 			clg->state = BAD_STATE;
263 		if (prandom_u32() < clg->a4)
264 			return true;
265 		break;
266 	case BAD_STATE:
267 		if (prandom_u32() < clg->a2)
268 			clg->state = GOOD_STATE;
269 		if (prandom_u32() > clg->a3)
270 			return true;
271 	}
272 
273 	return false;
274 }
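
/* In Gilbert-Elliot terms for the function above: a1 = p (good->bad
 * transition probability), a2 = r (bad->good), a4 = 1-k (loss probability
 * while in the good state) and a3 = h (probability that a packet still
 * gets through while in the bad state), all 32-bit fixed point.  Setting
 * a4 = 0 gives the Gilbert model (no loss in the good state), and
 * additionally a3 = 0 gives the Simple Gilbert model (every packet in the
 * bad state is lost).
 */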
275 
276 static bool loss_event(struct netem_sched_data *q)
277 {
278 	switch (q->loss_model) {
279 	case CLG_RANDOM:
280 		/* Random packet drop 0 => none, ~0 => all */
281 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
282 
283 	case CLG_4_STATES:
284 		/* 4-state loss model algorithm (also used for the GI
285 		 * model): extract a value from the 4-state Markov loss
286 		 * generator and drop the packet if the generator
287 		 * signals a loss.
288 		 */
289 		return loss_4state(q);
290 
291 	case CLG_GILB_ELL:
292 		/* Gilbert-Elliot loss model algorithm:
293 		 * extract a value from the Gilbert-Elliot loss
294 		 * generator and drop the packet if the generator
295 		 * signals a loss.
296 		 */
297 		return loss_gilb_ell(q);
298 	}
299 
300 	return false;	/* not reached */
301 }
302 
303 
304 /* tabledist - return a pseudo-randomly distributed value with mean mu and
305  * std deviation sigma.  Uses table lookup to approximate the desired
306  * distribution, and a uniformly-distributed pseudo-random source.
307  */
308 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
309 				struct crndstate *state,
310 				const struct disttable *dist)
311 {
312 	psched_tdiff_t x;
313 	long t;
314 	u32 rnd;
315 
316 	if (sigma == 0)
317 		return mu;
318 
319 	rnd = get_crandom(state);
320 
321 	/* default uniform distribution */
322 	if (dist == NULL)
323 		return (rnd % (2*sigma)) - sigma + mu;
324 
325 	t = dist->table[rnd % dist->size];
326 	x = (sigma % NETEM_DIST_SCALE) * t;
327 	if (x >= 0)
328 		x += NETEM_DIST_SCALE/2;
329 	else
330 		x -= NETEM_DIST_SCALE/2;
331 
332 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
333 }
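
/* The table entries are interpreted in units of NETEM_DIST_SCALE, so for a
 * picked entry t the result above is roughly mu + sigma * t / NETEM_DIST_SCALE;
 * the split computation merely keeps precision and rounding under control.
 * Without a table, the jitter is uniform in [mu - sigma, mu + sigma).  So,
 * for example, "delay 100ms 10ms distribution normal" yields delays roughly
 * normally distributed around 100ms with a 10ms standard deviation.
 */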
334 
335 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
336 {
337 	u64 ticks;
338 
339 	len += q->packet_overhead;
340 
341 	if (q->cell_size) {
342 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
343 
344 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
345 			cells++;
346 		len = cells * (q->cell_size + q->cell_overhead);
347 	}
348 
349 	ticks = (u64)len * NSEC_PER_SEC;
350 
351 	do_div(ticks, q->rate);
352 	return PSCHED_NS2TICKS(ticks);
353 }
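
/* Example: q->rate is in bytes per second (hence the NSEC_PER_SEC division),
 * so a 1500-byte packet at rate = 125000 B/s (1 Mbit/s) occupies
 * 1500 * 10^9 / 125000 ns = 12 ms of emulated link time.  With
 * cell_size = 53 (ATM-like cells) the length is first rounded up to
 * ceil(1500 / 53) = 29 cells, each padded by cell_overhead.
 */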
354 
355 static void tfifo_reset(struct Qdisc *sch)
356 {
357 	struct netem_sched_data *q = qdisc_priv(sch);
358 	struct rb_node *p = rb_first(&q->t_root);
359 
360 	while (p) {
361 		struct sk_buff *skb = rb_to_skb(p);
362 
363 		p = rb_next(p);
364 		rb_erase(&skb->rbnode, &q->t_root);
365 		rtnl_kfree_skbs(skb, skb);
366 	}
367 }
368 
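/* Insert one skb into the time-ordered rbtree, keyed by time_to_send.
 * Ties go to the right so that packets with equal timestamps keep their
 * arrival (FIFO) order.
 */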
369 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
370 {
371 	struct netem_sched_data *q = qdisc_priv(sch);
372 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
373 	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
374 
375 	while (*p) {
376 		struct sk_buff *skb;
377 
378 		parent = *p;
379 		skb = rb_to_skb(parent);
380 		if (tnext >= netem_skb_cb(skb)->time_to_send)
381 			p = &parent->rb_right;
382 		else
383 			p = &parent->rb_left;
384 	}
385 	rb_link_node(&nskb->rbnode, parent, p);
386 	rb_insert_color(&nskb->rbnode, &q->t_root);
387 	sch->q.qlen++;
388 }
389 
390 /* netem can't properly corrupt a megapacket (like we get from GSO), so
391  * when we statistically choose to corrupt one, we segment it, returning
392  * the first packet to be corrupted, and re-enqueue the remaining frames.
393  */
394 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
395 				     struct sk_buff **to_free)
396 {
397 	struct sk_buff *segs;
398 	netdev_features_t features = netif_skb_features(skb);
399 
400 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
401 
402 	if (IS_ERR_OR_NULL(segs)) {
403 		qdisc_drop(skb, sch, to_free);
404 		return NULL;
405 	}
406 	consume_skb(skb);
407 	return segs;
408 }
409 
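/* Requeue an skb at the head of the qdisc's internal list, bypassing the
 * tfifo; used by the reordering path so the chosen packet jumps the queue.
 */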
410 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
411 {
412 	skb->next = qh->head;
413 
414 	if (!qh->head)
415 		qh->tail = skb;
416 	qh->head = skb;
417 	qh->qlen++;
418 }
419 
420 /*
421  * Insert one skb into qdisc.
422  * Note: parent depends on return value to account for queue length.
423  * 	NET_XMIT_DROP: queue length didn't change.
424  *      NET_XMIT_SUCCESS: one skb was queued.
425  */
426 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
427 			 struct sk_buff **to_free)
428 {
429 	struct netem_sched_data *q = qdisc_priv(sch);
430 	/* We don't fill cb now as skb_unshare() may invalidate it */
431 	struct netem_skb_cb *cb;
432 	struct sk_buff *skb2;
433 	struct sk_buff *segs = NULL;
434 	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
435 	int nb = 0;
436 	int count = 1;
437 	int rc = NET_XMIT_SUCCESS;
438 
439 	/* Random duplication */
440 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
441 		++count;
442 
443 	/* Drop packet? */
444 	if (loss_event(q)) {
445 		if (q->ecn && INET_ECN_set_ce(skb))
446 			qdisc_qstats_drop(sch); /* mark packet */
447 		else
448 			--count;
449 	}
450 	if (count == 0) {
451 		qdisc_qstats_drop(sch);
452 		__qdisc_drop(skb, to_free);
453 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
454 	}
455 
456 	/* If a delay is expected, orphan the skb. (orphaning usually takes
457 	 * place at TX completion time, so _before_ the link transit delay)
458 	 */
459 	if (q->latency || q->jitter || q->rate)
460 		skb_orphan_partial(skb);
461 
462 	/*
463 	 * If we need to duplicate the packet, re-insert the copy at the top
464 	 * of the qdisc tree, since the parent queuer expects that only one
465 	 * skb will be queued.
466 	 */
467 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
468 		struct Qdisc *rootq = qdisc_root(sch);
469 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
470 
471 		q->duplicate = 0;
472 		rootq->enqueue(skb2, rootq, to_free);
473 		q->duplicate = dupsave;
474 	}
475 
476 	/*
477 	 * Randomized packet corruption.
478 	 * Make a copy if needed since we are modifying it.
479 	 * If the packet is going to be hardware checksummed, then
480 	 * do the checksum now in software before we mangle it.
481 	 */
482 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
483 		if (skb_is_gso(skb)) {
484 			segs = netem_segment(skb, sch, to_free);
485 			if (!segs)
486 				return NET_XMIT_DROP;
487 		} else {
488 			segs = skb;
489 		}
490 
491 		skb = segs;
492 		segs = segs->next;
493 
494 		skb = skb_unshare(skb, GFP_ATOMIC);
495 		if (unlikely(!skb)) {
496 			qdisc_qstats_drop(sch);
497 			goto finish_segs;
498 		}
499 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
500 		    skb_checksum_help(skb)) {
501 			qdisc_drop(skb, sch, to_free);
502 			goto finish_segs;
503 		}
504 
505 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
506 			1<<(prandom_u32() % 8);
507 	}
508 
509 	if (unlikely(sch->q.qlen >= sch->limit))
510 		return qdisc_drop(skb, sch, to_free);
511 
512 	qdisc_qstats_backlog_inc(sch, skb);
513 
514 	cb = netem_skb_cb(skb);
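	/* Example: with "netem gap 5 delay 10ms" (gap alone implies
	 * reorder = ~0 for legacy configs, see netem_change()), four packets
	 * in a row take the delayed tfifo path below and every fifth packet
	 * goes straight to the head of the queue, emulating reordering.
	 */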
515 	if (q->gap == 0 ||		/* not doing reordering */
516 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
517 	    q->reorder < get_crandom(&q->reorder_cor)) {
518 		psched_time_t now;
519 		psched_tdiff_t delay;
520 
521 		delay = tabledist(q->latency, q->jitter,
522 				  &q->delay_cor, q->delay_dist);
523 
524 		now = psched_get_time();
525 
526 		if (q->rate) {
527 			struct netem_skb_cb *last = NULL;
528 
529 			if (sch->q.tail)
530 				last = netem_skb_cb(sch->q.tail);
531 			if (q->t_root.rb_node) {
532 				struct sk_buff *t_skb;
533 				struct netem_skb_cb *t_last;
534 
535 				t_skb = skb_rb_last(&q->t_root);
536 				t_last = netem_skb_cb(t_skb);
537 				if (!last ||
538 				    t_last->time_to_send > last->time_to_send) {
539 					last = t_last;
540 				}
541 			}
542 
543 			if (last) {
544 				/*
545 				 * The last packet in the queue is the reference
546 				 * point (now); calculate its time bonus and
547 				 * subtract it from the delay.
548 				 */
549 				delay -= last->time_to_send - now;
550 				delay = max_t(psched_tdiff_t, 0, delay);
551 				now = last->time_to_send;
552 			}
553 
554 			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
555 		}
556 
557 		cb->time_to_send = now + delay;
558 		++q->counter;
559 		tfifo_enqueue(skb, sch);
560 	} else {
561 		/*
562 		 * Do re-ordering by putting one out of N packets at the front
563 		 * of the queue.
564 		 */
565 		cb->time_to_send = psched_get_time();
566 		q->counter = 0;
567 
568 		netem_enqueue_skb_head(&sch->q, skb);
569 		sch->qstats.requeues++;
570 	}
571 
572 finish_segs:
573 	if (segs) {
574 		while (segs) {
575 			skb2 = segs->next;
576 			segs->next = NULL;
577 			qdisc_skb_cb(segs)->pkt_len = segs->len;
578 			last_len = segs->len;
579 			rc = qdisc_enqueue(segs, sch, to_free);
580 			if (rc != NET_XMIT_SUCCESS) {
581 				if (net_xmit_drop_count(rc))
582 					qdisc_qstats_drop(sch);
583 			} else {
584 				nb++;
585 				len += last_len;
586 			}
587 			segs = skb2;
588 		}
589 		sch->q.qlen += nb;
590 		if (nb > 1)
591 			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
592 	}
593 	return NET_XMIT_SUCCESS;
594 }
595 
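/* Dequeue order: packets placed at the head of sch->q by the reordering
 * path go out first; otherwise the earliest entry of the tfifo rbtree is
 * released once its time_to_send has passed (or handed to the optional
 * child qdisc), and the watchdog is armed for the next due timestamp.
 */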
596 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
597 {
598 	struct netem_sched_data *q = qdisc_priv(sch);
599 	struct sk_buff *skb;
600 	struct rb_node *p;
601 
602 tfifo_dequeue:
603 	skb = __qdisc_dequeue_head(&sch->q);
604 	if (skb) {
605 		qdisc_qstats_backlog_dec(sch, skb);
606 deliver:
607 		qdisc_bstats_update(sch, skb);
608 		return skb;
609 	}
610 	p = rb_first(&q->t_root);
611 	if (p) {
612 		psched_time_t time_to_send;
613 
614 		skb = rb_to_skb(p);
615 
616 		/* has the packet's send time been reached yet? */
617 		time_to_send = netem_skb_cb(skb)->time_to_send;
618 		if (time_to_send <= psched_get_time()) {
619 			rb_erase(p, &q->t_root);
620 
621 			sch->q.qlen--;
622 			qdisc_qstats_backlog_dec(sch, skb);
623 			skb->next = NULL;
624 			skb->prev = NULL;
625 			/* skb->dev shares skb->rbnode area,
626 			 * so we need to restore its value.
627 			 */
628 			skb->dev = qdisc_dev(sch);
629 
630 #ifdef CONFIG_NET_CLS_ACT
631 			/*
632 			 * If it's at ingress let's pretend the delay is
633 			 * from the network (tstamp will be updated).
634 			 */
635 			if (skb->tc_redirected && skb->tc_from_ingress)
636 				skb->tstamp = 0;
637 #endif
638 
639 			if (q->qdisc) {
640 				unsigned int pkt_len = qdisc_pkt_len(skb);
641 				struct sk_buff *to_free = NULL;
642 				int err;
643 
644 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
645 				kfree_skb_list(to_free);
646 				if (err != NET_XMIT_SUCCESS &&
647 				    net_xmit_drop_count(err)) {
648 					qdisc_qstats_drop(sch);
649 					qdisc_tree_reduce_backlog(sch, 1,
650 								  pkt_len);
651 				}
652 				goto tfifo_dequeue;
653 			}
654 			goto deliver;
655 		}
656 
657 		if (q->qdisc) {
658 			skb = q->qdisc->ops->dequeue(q->qdisc);
659 			if (skb)
660 				goto deliver;
661 		}
662 		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
663 	}
664 
665 	if (q->qdisc) {
666 		skb = q->qdisc->ops->dequeue(q->qdisc);
667 		if (skb)
668 			goto deliver;
669 	}
670 	return NULL;
671 }
672 
673 static void netem_reset(struct Qdisc *sch)
674 {
675 	struct netem_sched_data *q = qdisc_priv(sch);
676 
677 	qdisc_reset_queue(sch);
678 	tfifo_reset(sch);
679 	if (q->qdisc)
680 		qdisc_reset(q->qdisc);
681 	qdisc_watchdog_cancel(&q->watchdog);
682 }
683 
684 static void dist_free(struct disttable *d)
685 {
686 	kvfree(d);
687 }
688 
689 /*
690  * Distribution data is a variable size payload containing
691  * Distribution data is a variable-size payload containing
692  * signed 16-bit values.
693 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
694 {
695 	struct netem_sched_data *q = qdisc_priv(sch);
696 	size_t n = nla_len(attr)/sizeof(__s16);
697 	const __s16 *data = nla_data(attr);
698 	spinlock_t *root_lock;
699 	struct disttable *d;
700 	int i;
701 
702 	if (n > NETEM_DIST_MAX)
703 		return -EINVAL;
704 
705 	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
706 	if (!d)
707 		return -ENOMEM;
708 
709 	d->size = n;
710 	for (i = 0; i < n; i++)
711 		d->table[i] = data[i];
712 
713 	root_lock = qdisc_root_sleeping_lock(sch);
714 
715 	spin_lock_bh(root_lock);
716 	swap(q->delay_dist, d);
717 	spin_unlock_bh(root_lock);
718 
719 	dist_free(d);
720 	return 0;
721 }
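
/* The table normally comes from iproute2, which ships pre-generated
 * distribution files (normal, pareto, ...) and passes one via the
 * TCA_NETEM_DELAY_DIST attribute; tabledist() above turns each entry t
 * into a jitter of roughly sigma * t / NETEM_DIST_SCALE.  Swapping the
 * pointer under the qdisc root lock lets the old table be freed outside
 * the lock.
 */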
722 
723 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
724 {
725 	const struct tc_netem_corr *c = nla_data(attr);
726 
727 	init_crandom(&q->delay_cor, c->delay_corr);
728 	init_crandom(&q->loss_cor, c->loss_corr);
729 	init_crandom(&q->dup_cor, c->dup_corr);
730 }
731 
732 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
733 {
734 	const struct tc_netem_reorder *r = nla_data(attr);
735 
736 	q->reorder = r->probability;
737 	init_crandom(&q->reorder_cor, r->correlation);
738 }
739 
740 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
741 {
742 	const struct tc_netem_corrupt *r = nla_data(attr);
743 
744 	q->corrupt = r->probability;
745 	init_crandom(&q->corrupt_cor, r->correlation);
746 }
747 
748 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
749 {
750 	const struct tc_netem_rate *r = nla_data(attr);
751 
752 	q->rate = r->rate;
753 	q->packet_overhead = r->packet_overhead;
754 	q->cell_size = r->cell_size;
755 	q->cell_overhead = r->cell_overhead;
756 	if (q->cell_size)
757 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
758 	else
759 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
760 }
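
/* Rates that do not fit in the 32-bit tc_netem_rate.rate field are passed
 * separately as TCA_NETEM_RATE64 and folded in by netem_change(); they are
 * reported back the same way in netem_dump().
 */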
761 
762 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
763 {
764 	const struct nlattr *la;
765 	int rem;
766 
767 	nla_for_each_nested(la, attr, rem) {
768 		u16 type = nla_type(la);
769 
770 		switch (type) {
771 		case NETEM_LOSS_GI: {
772 			const struct tc_netem_gimodel *gi = nla_data(la);
773 
774 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
775 				pr_info("netem: incorrect gi model size\n");
776 				return -EINVAL;
777 			}
778 
779 			q->loss_model = CLG_4_STATES;
780 
781 			q->clg.state = TX_IN_GAP_PERIOD;
782 			q->clg.a1 = gi->p13;
783 			q->clg.a2 = gi->p31;
784 			q->clg.a3 = gi->p32;
785 			q->clg.a4 = gi->p14;
786 			q->clg.a5 = gi->p23;
787 			break;
788 		}
789 
790 		case NETEM_LOSS_GE: {
791 			const struct tc_netem_gemodel *ge = nla_data(la);
792 
793 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
794 				pr_info("netem: incorrect ge model size\n");
795 				return -EINVAL;
796 			}
797 
798 			q->loss_model = CLG_GILB_ELL;
799 			q->clg.state = GOOD_STATE;
800 			q->clg.a1 = ge->p;
801 			q->clg.a2 = ge->r;
802 			q->clg.a3 = ge->h;
803 			q->clg.a4 = ge->k1;
804 			break;
805 		}
806 
807 		default:
808 			pr_info("netem: unknown loss type %u\n", type);
809 			return -EINVAL;
810 		}
811 	}
812 
813 	return 0;
814 }
815 
816 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
817 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
818 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
819 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
820 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
821 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
822 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
823 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
824 };
825 
826 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
827 		      const struct nla_policy *policy, int len)
828 {
829 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
830 
831 	if (nested_len < 0) {
832 		pr_info("netem: invalid attributes len %d\n", nested_len);
833 		return -EINVAL;
834 	}
835 
836 	if (nested_len >= nla_attr_size(0))
837 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
838 				 nested_len, policy, NULL);
839 
840 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
841 	return 0;
842 }
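
/* The TCA_OPTIONS payload for netem is a struct tc_netem_qopt immediately
 * followed by optional nested attributes; parse_attr() above skips the
 * aligned qopt header and hands the remainder to nla_parse().  This is why
 * netem_change() can read qopt via nla_data(opt) and still find the
 * TCA_NETEM_* attributes behind it.
 */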
843 
844 /* Parse netlink message to set options */
845 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
846 {
847 	struct netem_sched_data *q = qdisc_priv(sch);
848 	struct nlattr *tb[TCA_NETEM_MAX + 1];
849 	struct tc_netem_qopt *qopt;
850 	struct clgstate old_clg;
851 	int old_loss_model = CLG_RANDOM;
852 	int ret;
853 
854 	if (opt == NULL)
855 		return -EINVAL;
856 
857 	qopt = nla_data(opt);
858 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
859 	if (ret < 0)
860 		return ret;
861 
862 	/* backup q->clg and q->loss_model */
863 	old_clg = q->clg;
864 	old_loss_model = q->loss_model;
865 
866 	if (tb[TCA_NETEM_LOSS]) {
867 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
868 		if (ret) {
869 			q->loss_model = old_loss_model;
870 			return ret;
871 		}
872 	} else {
873 		q->loss_model = CLG_RANDOM;
874 	}
875 
876 	if (tb[TCA_NETEM_DELAY_DIST]) {
877 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
878 		if (ret) {
879 			/* recover clg and loss_model, in case
880 			 * q->clg and q->loss_model were modified
881 			 * in get_loss_clg()
882 			 */
883 			q->clg = old_clg;
884 			q->loss_model = old_loss_model;
885 			return ret;
886 		}
887 	}
888 
889 	sch->limit = qopt->limit;
890 
891 	q->latency = qopt->latency;
892 	q->jitter = qopt->jitter;
893 	q->limit = qopt->limit;
894 	q->gap = qopt->gap;
895 	q->counter = 0;
896 	q->loss = qopt->loss;
897 	q->duplicate = qopt->duplicate;
898 
899 	/* For compatibility with earlier versions:
900 	 * if gap is set, assume 100% reorder probability.
901 	 */
902 	if (q->gap)
903 		q->reorder = ~0;
904 
905 	if (tb[TCA_NETEM_CORR])
906 		get_correlation(q, tb[TCA_NETEM_CORR]);
907 
908 	if (tb[TCA_NETEM_REORDER])
909 		get_reorder(q, tb[TCA_NETEM_REORDER]);
910 
911 	if (tb[TCA_NETEM_CORRUPT])
912 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
913 
914 	if (tb[TCA_NETEM_RATE])
915 		get_rate(q, tb[TCA_NETEM_RATE]);
916 
917 	if (tb[TCA_NETEM_RATE64])
918 		q->rate = max_t(u64, q->rate,
919 				nla_get_u64(tb[TCA_NETEM_RATE64]));
920 
921 	if (tb[TCA_NETEM_ECN])
922 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
923 
924 	return ret;
925 }
926 
927 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
928 {
929 	struct netem_sched_data *q = qdisc_priv(sch);
930 	int ret;
931 
932 	qdisc_watchdog_init(&q->watchdog, sch);
933 
934 	if (!opt)
935 		return -EINVAL;
936 
937 	q->loss_model = CLG_RANDOM;
938 	ret = netem_change(sch, opt);
939 	if (ret)
940 		pr_info("netem: change failed\n");
941 	return ret;
942 }
943 
944 static void netem_destroy(struct Qdisc *sch)
945 {
946 	struct netem_sched_data *q = qdisc_priv(sch);
947 
948 	qdisc_watchdog_cancel(&q->watchdog);
949 	if (q->qdisc)
950 		qdisc_destroy(q->qdisc);
951 	dist_free(q->delay_dist);
952 }
953 
954 static int dump_loss_model(const struct netem_sched_data *q,
955 			   struct sk_buff *skb)
956 {
957 	struct nlattr *nest;
958 
959 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
960 	if (nest == NULL)
961 		goto nla_put_failure;
962 
963 	switch (q->loss_model) {
964 	case CLG_RANDOM:
965 		/* legacy loss model */
966 		nla_nest_cancel(skb, nest);
967 		return 0;	/* no data */
968 
969 	case CLG_4_STATES: {
970 		struct tc_netem_gimodel gi = {
971 			.p13 = q->clg.a1,
972 			.p31 = q->clg.a2,
973 			.p32 = q->clg.a3,
974 			.p14 = q->clg.a4,
975 			.p23 = q->clg.a5,
976 		};
977 
978 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
979 			goto nla_put_failure;
980 		break;
981 	}
982 	case CLG_GILB_ELL: {
983 		struct tc_netem_gemodel ge = {
984 			.p = q->clg.a1,
985 			.r = q->clg.a2,
986 			.h = q->clg.a3,
987 			.k1 = q->clg.a4,
988 		};
989 
990 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
991 			goto nla_put_failure;
992 		break;
993 	}
994 	}
995 
996 	nla_nest_end(skb, nest);
997 	return 0;
998 
999 nla_put_failure:
1000 	nla_nest_cancel(skb, nest);
1001 	return -1;
1002 }
1003 
1004 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1005 {
1006 	const struct netem_sched_data *q = qdisc_priv(sch);
1007 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1008 	struct tc_netem_qopt qopt;
1009 	struct tc_netem_corr cor;
1010 	struct tc_netem_reorder reorder;
1011 	struct tc_netem_corrupt corrupt;
1012 	struct tc_netem_rate rate;
1013 
1014 	qopt.latency = q->latency;
1015 	qopt.jitter = q->jitter;
1016 	qopt.limit = q->limit;
1017 	qopt.loss = q->loss;
1018 	qopt.gap = q->gap;
1019 	qopt.duplicate = q->duplicate;
1020 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1021 		goto nla_put_failure;
1022 
1023 	cor.delay_corr = q->delay_cor.rho;
1024 	cor.loss_corr = q->loss_cor.rho;
1025 	cor.dup_corr = q->dup_cor.rho;
1026 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1027 		goto nla_put_failure;
1028 
1029 	reorder.probability = q->reorder;
1030 	reorder.correlation = q->reorder_cor.rho;
1031 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1032 		goto nla_put_failure;
1033 
1034 	corrupt.probability = q->corrupt;
1035 	corrupt.correlation = q->corrupt_cor.rho;
1036 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1037 		goto nla_put_failure;
1038 
1039 	if (q->rate >= (1ULL << 32)) {
1040 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1041 				      TCA_NETEM_PAD))
1042 			goto nla_put_failure;
1043 		rate.rate = ~0U;
1044 	} else {
1045 		rate.rate = q->rate;
1046 	}
1047 	rate.packet_overhead = q->packet_overhead;
1048 	rate.cell_size = q->cell_size;
1049 	rate.cell_overhead = q->cell_overhead;
1050 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1051 		goto nla_put_failure;
1052 
1053 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1054 		goto nla_put_failure;
1055 
1056 	if (dump_loss_model(q, skb) != 0)
1057 		goto nla_put_failure;
1058 
1059 	return nla_nest_end(skb, nla);
1060 
1061 nla_put_failure:
1062 	nlmsg_trim(skb, nla);
1063 	return -1;
1064 }
1065 
1066 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1067 			  struct sk_buff *skb, struct tcmsg *tcm)
1068 {
1069 	struct netem_sched_data *q = qdisc_priv(sch);
1070 
1071 	if (cl != 1 || !q->qdisc) 	/* only one class */
1072 		return -ENOENT;
1073 
1074 	tcm->tcm_handle |= TC_H_MIN(1);
1075 	tcm->tcm_info = q->qdisc->handle;
1076 
1077 	return 0;
1078 }
1079 
1080 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1081 		     struct Qdisc **old)
1082 {
1083 	struct netem_sched_data *q = qdisc_priv(sch);
1084 
1085 	*old = qdisc_replace(sch, new, &q->qdisc);
1086 	return 0;
1087 }
1088 
1089 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1090 {
1091 	struct netem_sched_data *q = qdisc_priv(sch);
1092 	return q->qdisc;
1093 }
1094 
1095 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1096 {
1097 	return 1;
1098 }
1099 
1100 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1101 {
1102 	if (!walker->stop) {
1103 		if (walker->count >= walker->skip)
1104 			if (walker->fn(sch, 1, walker) < 0) {
1105 				walker->stop = 1;
1106 				return;
1107 			}
1108 		walker->count++;
1109 	}
1110 }
1111 
1112 static const struct Qdisc_class_ops netem_class_ops = {
1113 	.graft		=	netem_graft,
1114 	.leaf		=	netem_leaf,
1115 	.find		=	netem_find,
1116 	.walk		=	netem_walk,
1117 	.dump		=	netem_dump_class,
1118 };
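
/* netem exposes exactly one class (minor 1) so that a child qdisc can be
 * grafted underneath for rate control, e.g. roughly:
 *
 *   tc qdisc add dev eth0 root handle 1: netem delay 50ms
 *   tc qdisc add dev eth0 parent 1:1 handle 10: tbf rate 1mbit burst 5kb latency 70ms
 *
 * The grafted qdisc becomes q->qdisc and is serviced on the dequeue path.
 */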
1119 
1120 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1121 	.id		=	"netem",
1122 	.cl_ops		=	&netem_class_ops,
1123 	.priv_size	=	sizeof(struct netem_sched_data),
1124 	.enqueue	=	netem_enqueue,
1125 	.dequeue	=	netem_dequeue,
1126 	.peek		=	qdisc_peek_dequeued,
1127 	.init		=	netem_init,
1128 	.reset		=	netem_reset,
1129 	.destroy	=	netem_destroy,
1130 	.change		=	netem_change,
1131 	.dump		=	netem_dump,
1132 	.owner		=	THIS_MODULE,
1133 };
1134 
1135 
1136 static int __init netem_module_init(void)
1137 {
1138 	pr_info("netem: version " VERSION "\n");
1139 	return register_qdisc(&netem_qdisc_ops);
1140 }
1141 static void __exit netem_module_exit(void)
1142 {
1143 	unregister_qdisc(&netem_qdisc_ops);
1144 }
1145 module_init(netem_module_init)
1146 module_exit(netem_module_exit)
1147 MODULE_LICENSE("GPL");
1148