/*
 * net/sched/sch_netem.c	Network emulator
 *
 * 		This program is free software; you can redistribute it and/or
 * 		modify it under the terms of the GNU General Public License
 * 		as published by the Free Software Foundation; either version
 * 		2 of the License.
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can also be loaded from a table to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines.  It does not need to do bandwidth
	 control either, since that can be handled by using token
	 bucket or other rate control.

     Correlated Loss Generator models

	Added generation of correlated loss according to a 4-state
	Markov chain (the GI model described in [2] below), with the
	2-state "Gilbert-Elliot" model supported as a special case.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
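
/*
 * Illustrative userspace usage (not part of this file): netem is
 * normally configured with the tc utility from iproute2.  The device
 * name and values below are examples only:
 *
 *   # 100ms delay, +/- 10ms jitter, each delay 25% correlated with the last
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *   # 0.3% packet loss, 25% correlated with the previous loss decision
 *   tc qdisc change dev eth0 root netem loss 0.3% 25%
 */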

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
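
/*
 * Worked example (illustrative): rho is a fixed-point fraction of 2^32,
 * so with rho = 0x80000000 (~0.5) the blend above reduces to roughly
 * (value + last) / 2, i.e. each draw mixes fresh entropy equally with
 * the previous output.  rho = 0 degenerates to pure net_random(); rho
 * close to 2^32 pins the sequence near its last value.
 */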

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = net_random();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   1 => successfully transmitted packets within a gap period
	 *   4 => isolated losses within a gap period
	 *   3 => lost packets within a burst period
	 *   2 => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case 1:
		/* thresholds are cumulative, as in case 3 below */
		if (rnd < clg->a4) {
			clg->state = 4;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = 3;
			return true;
		} else if (clg->a1 + clg->a4 < rnd)
			clg->state = 1;

		break;
	case 2:
		if (rnd < clg->a5) {
			clg->state = 3;
			return true;
		} else
			clg->state = 2;

		break;
	case 3:
		if (rnd < clg->a3)
			clg->state = 2;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = 1;
			return true;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = 3;
			return true;
		}
		break;
	case 4:
		clg->state = 1;
		break;
	}

	return false;
}
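
/*
 * Illustrative tc invocation for the 4-state model (assumed iproute2
 * "loss state" syntax; the values are examples only):
 *
 *   tc qdisc add dev eth0 root netem loss state 1% 10% 70% 0.1% 1%
 *
 * which would set p13, p31, p32, p23 and p14 in that order.
 */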

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case 1:
		if (net_random() < clg->a1)
			clg->state = 2;
		if (net_random() < clg->a4)
			return true;
		break;	/* must not fall through into the bad state */
	case 2:
		if (net_random() < clg->a2)
			clg->state = 1;
		/* a3 holds h, the probability of successful transmission
		 * in the bad state, so lose with probability 1-h.
		 */
		if (net_random() > clg->a3)
			return true;
	}

	return false;
}
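
/*
 * Illustrative tc invocation for the Gilbert-Elliot model (assumed
 * iproute2 "loss gemodel" syntax; values are examples only):
 *
 *   tc qdisc add dev eth0 root netem loss gemodel 1% 10% 70% 0.1%
 *
 * i.e. p (good->bad), r (bad->good), 1-h and 1-k.  Giving only p and r
 * yields the Simple Gilbert special case.
 */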

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model):
		 * a value is extracted from the Markov 4-state loss
		 * generator, and the packet is dropped when it indicates
		 * a loss.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * a value is extracted from the Gilbert-Elliot loss
		 * generator, and the packet is dropped when it indicates
		 * a loss.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
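
/*
 * Worked example (illustrative): NETEM_DIST_SCALE is 8192, and table
 * entries are scaled so t/NETEM_DIST_SCALE is the normalized deviate.
 * With sigma = 10000 ticks and t = 4096 (i.e. +0.5 sigma):
 *
 *   sigma / NETEM_DIST_SCALE = 1,  sigma % NETEM_DIST_SCALE = 1808
 *   x = 1808 * 4096 + 4096 (rounding) = 7409664
 *   result = 7409664 / 8192 + 1 * 4096 + mu = mu + 5000
 *
 * i.e. the quotient/remainder split computes sigma * t / 8192 without
 * risking 64-bit overflow in the product.
 */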

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate the packet, re-insert the copy at the
	 * top of the qdisc tree, since the parent queuer expects that
	 * only one skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		/* flip one random bit in the linear data area */
		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();
		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
		q->qdisc->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret)) {
			sch->qstats.drops++;
			return ret;
		}
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
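
/*
 * Illustrative tc invocations for the paths above (device and values
 * are examples only):
 *
 *   # duplicate 1% of packets, corrupt 0.1% (single bit flip)
 *   tc qdisc add dev eth0 root netem duplicate 1% corrupt 0.1%
 *
 *   # delay with reordering: 25% of packets (50% correlated) are sent
 *   # immediately, the rest are delayed by 10ms
 *   tc qdisc change dev eth0 root netem delay 10ms reorder 25% 50%
 */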

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (qdisc_is_throttled(sch))
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);
		psched_time_t now = psched_get_time();

		/* has the packet's scheduled send time been reached? */
		if (cb->time_to_send <= now) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		/* not yet: arm the watchdog to wake us at that time */
		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	if (d) {
		if (is_vmalloc_addr(d))
			vfree(d);
		else
			kfree(d);
	}
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	dist_free(q->delay_dist);
	q->delay_dist = d;
	spin_unlock_bh(root_lock);
	return 0;
}
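
/*
 * Illustrative userspace flow for the distribution table: iproute2
 * ships precomputed tables (e.g. normal.dist, pareto.dist, typically
 * under /usr/lib/tc; path is an assumption of the local install), and
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 20ms distribution normal
 *
 * passes that table down as TCA_NETEM_DELAY_DIST so tabledist() can
 * draw jitter from a non-uniform curve.
 */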

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = 1;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = 1;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
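
/*
 * Sketch of the TCA_OPTIONS payload parse_attr() expects (for
 * illustration; the layout is dictated by netem's legacy ABI):
 *
 *   +--------------------------------------+
 *   | struct tc_netem_qopt (fixed header)  |
 *   +--------------------------------------+
 *   | optional nested nlattrs:             |
 *   |   TCA_NETEM_CORR, TCA_NETEM_REORDER, |
 *   |   TCA_NETEM_CORRUPT, TCA_NETEM_LOSS, |
 *   |   TCA_NETEM_DELAY_DIST               |
 *   +--------------------------------------+
 *
 * parse_attr() skips the fixed struct and runs nla_parse() on whatever
 * follows, which is why netem can't use plain nla_parse_nested().
 */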

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	ret = fifo_set_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_info("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* For compatibility with earlier versions:
	 * if gap is set, assume 100% reorder probability.
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	q->loss_model = CLG_RANDOM;
	if (tb[TCA_NETEM_LOSS])
		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);

	return ret;
}

/*
 * Special case version of FIFO queue for use by netem.
 * It queues in order based on the timestamps in the skbs' control blocks.
 */
struct fifo_sched_data {
	u32 limit;
	psched_time_t oldest;
};

static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		/* Optimize for add at tail */
		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
			q->oldest = tnext;
			return qdisc_enqueue_tail(nskb, sch);
		}

		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb = netem_skb_cb(skb);

			if (tnext >= cb->time_to_send)
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += qdisc_pkt_len(nskb);

		return NET_XMIT_SUCCESS;
	}

	return qdisc_reshape_fail(nskb, sch);
}

static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = nla_data(opt);

		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	q->oldest = PSCHED_PASTPERFECT;
	return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.peek		=	qdisc_peek_head,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
				     TC_H_MAKE(sch->handle, 1));
	if (!q->qdisc) {
		pr_notice("netem: tfifo qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_info("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");