xref: /linux/net/sched/sch_netem.c (revision ecbdf3da7813db00014ff8b091bb66303bba29a0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_netem.c	Network emulator
4  *
5  *  		Many of the algorithms and ideas for this came from
6  *		NIST Net which is not copyrighted.
7  *
8  * Authors:	Stephen Hemminger <shemminger@osdl.org>
9  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10  */
11 
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/prandom.h>
21 #include <linux/rtnetlink.h>
22 #include <linux/reciprocal_div.h>
23 #include <linux/rbtree.h>
24 
25 #include <net/gso.h>
26 #include <net/netlink.h>
27 #include <net/pkt_sched.h>
28 #include <net/inet_ecn.h>
29 
30 /*	Network Emulation Queuing algorithm.
31 	====================================
32 
	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD
36 
37 	 ----------------------------------------------------------------
38 
39 	 This started out as a simple way to delay outgoing packets to
40 	 test TCP but has grown to include most of the functionality
41 	 of a full blown network emulator like NISTnet. It can delay
42 	 packets and add random jitter (and correlation). The random
43 	 distribution can be loaded from a table as well to provide
44 	 normal, Pareto, or experimental curves. Packet loss,
45 	 duplication, and reordering can also be emulated.
46 
47 	 This qdisc does not do classification that can be handled in
48 	 layering other disciplines.  It does not need to do bandwidth
49 	 control either since that can be handled by using token
50 	 bucket or other rate control.
51 
52      Correlated Loss Generator models
53 
54 	Added generation of correlated loss according to the
55 	"Gilbert-Elliot" model, a 4-state markov model.
56 
57 	References:
58 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
59 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
60 	and intuitive loss model for packet networks and its implementation
61 	in the Netem module in the Linux kernel", available in [1]
62 
	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
64 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
65 */
66 
/* Lookup table of signed 16-bit samples consumed by tabledist(). */
struct disttable {
	u32  size;	/* number of entries in table[] */
	s16 table[] __counted_by(size);	/* indexed by (random value % size) */
};
71 
/* Loss models; the value stored in netem_sched_data.loss_model selects
 * which generator loss_event() consults.
 */
enum {
	CLG_RANDOM,	/* threshold q->loss compared against get_crandom() */
	CLG_4_STATES,	/* 4-state Markov chain, see loss_4state() */
	CLG_GILB_ELL,	/* Gilbert-Elliot model, see loss_gilb_ell() */
};
78 
/* States in GE model (values kept in clgstate.state, starting at 1) */
enum {
	GOOD_STATE = 1,	/* initial state set by get_loss_clg() */
	BAD_STATE,
};
84 
/* States in 4 state model (values kept in clgstate.state, starting at 1) */
enum {
	TX_IN_GAP_PERIOD = 1,	/* initial state set by get_loss_clg() */
	TX_IN_BURST_PERIOD,
	LOST_IN_GAP_PERIOD,
	LOST_IN_BURST_PERIOD,
};
92 
/* Private state of one netem qdisc instance, obtained with qdisc_priv().
 * Members are grouped by how the enqueue/dequeue paths use them (see the
 * per-group comments below).
 */
struct netem_sched_data {
	/* Cacheline 0: tfifo state and per-packet enqueue/dequeue scalars. */
	struct rb_root		t_root;		/* packets inserted out of send-time order */
	struct sk_buff		*t_head;	/* first skb of the in-order linear list */
	struct sk_buff		*t_tail;	/* last skb of the in-order linear list */
	u32			t_len;		/* packets held in tree + list */
	u32			counter;	/* packets delayed since the last reordered one */
	s64			latency;	/* mean delay handed to tabledist(), in ns */
	s64			jitter;		/* delay spread handed to tabledist(), in ns */
	u64			rate;		/* divisor in packet_time_ns(); 0 = no rate shaping */
	u32			gap;		/* reordering gap; 0 = not doing reordering */
	u32			loss;		/* random drop threshold: 0 => none, ~0 => all */

	/* Cacheline 1: zero-check scalars and correlation states. */
	u32			duplicate;	/* duplication threshold; 0 = off */
	u32			reorder;	/* reorder threshold compared with get_crandom() */
	u32			corrupt;	/* corruption threshold; 0 = off */
	u32			ecn;		/* non-zero: try CE marking instead of dropping */
	struct crndstate {
		u32 last;	/* previous output, blended into the next one */
		u32 rho;	/* correlation weight; 0 = no correlation */
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
	u8			loss_model;	/* one of CLG_RANDOM/CLG_4_STATES/CLG_GILB_ELL */

	/* Cacheline 2: PRNG, distribution tables, slot dequeue state etc. */
	struct prng {
		u64 seed;	/* NOTE(review): presumably from TCA_NETEM_PRNG_SEED - confirm */
		struct rnd_state prng_state;	/* state fed to prandom_u32_state() */
	} prng;
	struct disttable	*delay_dist;	/* delay table; NULL = uniform distribution */
	struct slotstate {
		u64 slot_next;		/* earliest time the next slot opens; 0 = slotting off */
		s32 packets_left;	/* packets still allowed in the current slot */
		s32 bytes_left;		/* bytes still allowed in the current slot */
	} slot;
	struct disttable	*slot_dist;	/* slot delay table; NULL = uniform min..max */
	struct Qdisc		*qdisc;		/* optional child qdisc fed by netem_dequeue() */

	/*
	 * Warm: rate-shaping parameters (only read when rate != 0) and
	 * configuration-only fields.  The fast path reads sch->limit, not
	 * q->limit.
	 */
	s32			packet_overhead;	/* added to every packet length */
	u32			cell_size;		/* non-zero: lengths are rounded up to cells */
	struct reciprocal_value	cell_size_reciprocal;	/* precomputed for reciprocal_divide() */
	s32			cell_overhead;		/* added to each cell */
	u32			limit;			/* copy of the configured queue limit */

	/* Correlated Loss Generation models */
	struct clgstate {
		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */

		/* state of the Markov chain */
		u8  state;
	} clg;

	/* Impairment counters (updated with WRITE_ONCE() on the enqueue path) */
	u64			delayed;
	u64			dropped;
	u64			corrupted;
	u64			duplicated;
	u64			ecn_marked;
	u64			reordered;
	u64			allocation_errors;

	/* Cold tail: slot reschedule config and the watchdog timer. */
	struct tc_netem_slot	slot_config;
	struct qdisc_watchdog	watchdog;
};
168 
/* Time stamp put into the socket buffer control block.
 * Only valid while the skb sits in our internal t(ime)fifo queue.
 *
 * skb->rbnode uses the same storage as skb->next, skb->prev and
 * skb->tstamp, and skb->next & skb->prev are scratch space for a qdisc,
 * so the send time is kept in skb->cb[] instead of skb->tstamp.
 */
struct netem_skb_cb {
	u64	        time_to_send;	/* compared with ktime_get_ns() at dequeue */
};
179 
180 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
181 {
182 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
183 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
184 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
185 }
186 
187 /* init_crandom - initialize correlated random number generator
188  * Use entropy source for initial seed.
189  */
190 static void init_crandom(struct crndstate *state, unsigned long rho)
191 {
192 	state->rho = rho;
193 	state->last = get_random_u32();
194 }
195 
196 /* get_crandom - correlated random number generator
197  * Next number depends on last value.
198  * rho is scaled to avoid floating point.
199  */
200 static u32 get_crandom(struct crndstate *state, struct prng *p)
201 {
202 	u64 value, rho;
203 	unsigned long answer;
204 	struct rnd_state *s = &p->prng_state;
205 
206 	if (!state || state->rho == 0)	/* no correlation */
207 		return prandom_u32_state(s);
208 
209 	value = prandom_u32_state(s);
210 	rho = (u64)state->rho + 1;
211 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
212 	state->last = answer;
213 	return answer;
214 }
215 
216 /* loss_4state - 4-state model loss generator
217  * Generates losses according to the 4-state Markov chain adopted in
218  * the GI (General and Intuitive) loss model.
219  */
220 static bool loss_4state(struct netem_sched_data *q)
221 {
222 	struct clgstate *clg = &q->clg;
223 	u32 rnd = prandom_u32_state(&q->prng.prng_state);
224 
225 	/*
226 	 * Makes a comparison between rnd and the transition
227 	 * probabilities outgoing from the current state, then decides the
228 	 * next state and if the next packet has to be transmitted or lost.
229 	 * The four states correspond to:
230 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
231 	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
232 	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
233 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
234 	 */
235 	switch (clg->state) {
236 	case TX_IN_GAP_PERIOD:
237 		if (rnd < clg->a4) {
238 			clg->state = LOST_IN_GAP_PERIOD;
239 			return true;
240 		} else if (rnd < clg->a1 + clg->a4) {
241 			clg->state = LOST_IN_BURST_PERIOD;
242 			return true;
243 		} else {
244 			clg->state = TX_IN_GAP_PERIOD;
245 		}
246 
247 		break;
248 	case TX_IN_BURST_PERIOD:
249 		if (rnd < clg->a5) {
250 			clg->state = LOST_IN_BURST_PERIOD;
251 			return true;
252 		} else {
253 			clg->state = TX_IN_BURST_PERIOD;
254 		}
255 
256 		break;
257 	case LOST_IN_BURST_PERIOD:
258 		if (rnd < clg->a3)
259 			clg->state = TX_IN_BURST_PERIOD;
260 		else if (rnd < clg->a2 + clg->a3) {
261 			clg->state = TX_IN_GAP_PERIOD;
262 		} else {
263 			clg->state = LOST_IN_BURST_PERIOD;
264 			return true;
265 		}
266 		break;
267 	case LOST_IN_GAP_PERIOD:
268 		clg->state = TX_IN_GAP_PERIOD;
269 		break;
270 	}
271 
272 	return false;
273 }
274 
275 /* loss_gilb_ell - Gilbert-Elliot model loss generator
276  * Generates losses according to the Gilbert-Elliot loss model or
277  * its special cases  (Gilbert or Simple Gilbert)
278  *
279  * Makes a comparison between random number and the transition
280  * probabilities outgoing from the current state, then decides the
281  * next state. A second random number is extracted and the comparison
282  * with the loss probability of the current state decides if the next
283  * packet will be transmitted or lost.
284  */
285 static bool loss_gilb_ell(struct netem_sched_data *q)
286 {
287 	struct clgstate *clg = &q->clg;
288 	struct rnd_state *s = &q->prng.prng_state;
289 
290 	switch (clg->state) {
291 	case GOOD_STATE:
292 		if (prandom_u32_state(s) < clg->a1)
293 			clg->state = BAD_STATE;
294 		if (prandom_u32_state(s) < clg->a4)
295 			return true;
296 		break;
297 	case BAD_STATE:
298 		if (prandom_u32_state(s) < clg->a2)
299 			clg->state = GOOD_STATE;
300 		if (prandom_u32_state(s) > clg->a3)
301 			return true;
302 	}
303 
304 	return false;
305 }
306 
307 static bool loss_event(struct netem_sched_data *q)
308 {
309 	switch (q->loss_model) {
310 	case CLG_RANDOM:
311 		/* Random packet drop 0 => none, ~0 => all */
312 		return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);
313 
314 	case CLG_4_STATES:
315 		/* 4state loss model algorithm (used also for GI model)
316 		* Extracts a value from the markov 4 state loss generator,
317 		* if it is 1 drops a packet and if needed writes the event in
318 		* the kernel logs
319 		*/
320 		return loss_4state(q);
321 
322 	case CLG_GILB_ELL:
323 		/* Gilbert-Elliot loss model algorithm
324 		* Extracts a value from the Gilbert-Elliot loss generator,
325 		* if it is 1 drops a packet and if needed writes the event in
326 		* the kernel logs
327 		*/
328 		return loss_gilb_ell(q);
329 	}
330 
331 	return false;	/* not reached */
332 }
333 
334 
335 /* tabledist - return a pseudo-randomly distributed value with mean mu and
336  * std deviation sigma.  Uses table lookup to approximate the desired
337  * distribution, and a uniformly-distributed pseudo-random source.
338  */
339 static s64 tabledist(s64 mu, s32 sigma,
340 		     struct crndstate *state,
341 		     struct prng *prng,
342 		     const struct disttable *dist)
343 {
344 	s64 x;
345 	long t;
346 	u32 rnd;
347 
348 	if (sigma == 0)
349 		return mu;
350 
351 	rnd = get_crandom(state, prng);
352 
353 	/* default uniform distribution */
354 	if (dist == NULL)
355 		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
356 
357 	t = dist->table[rnd % dist->size];
358 	x = (sigma % NETEM_DIST_SCALE) * t;
359 	if (x >= 0)
360 		x += NETEM_DIST_SCALE/2;
361 	else
362 		x -= NETEM_DIST_SCALE/2;
363 
364 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
365 }
366 
367 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
368 {
369 	len += q->packet_overhead;
370 
371 	if (q->cell_size) {
372 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
373 
374 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
375 			cells++;
376 		len = cells * (q->cell_size + q->cell_overhead);
377 	}
378 
379 	return div64_u64(len * NSEC_PER_SEC, q->rate);
380 }
381 
382 static void tfifo_reset(struct Qdisc *sch)
383 {
384 	struct netem_sched_data *q = qdisc_priv(sch);
385 	struct rb_node *p = rb_first(&q->t_root);
386 
387 	while (p) {
388 		struct sk_buff *skb = rb_to_skb(p);
389 
390 		p = rb_next(p);
391 		rb_erase(&skb->rbnode, &q->t_root);
392 		rtnl_kfree_skbs(skb, skb);
393 	}
394 
395 	rtnl_kfree_skbs(q->t_head, q->t_tail);
396 	q->t_head = NULL;
397 	q->t_tail = NULL;
398 	q->t_len = 0;
399 }
400 
401 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
402 {
403 	struct netem_sched_data *q = qdisc_priv(sch);
404 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
405 
406 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
407 		if (q->t_tail)
408 			q->t_tail->next = nskb;
409 		else
410 			q->t_head = nskb;
411 		q->t_tail = nskb;
412 	} else {
413 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
414 
415 		while (*p) {
416 			struct sk_buff *skb;
417 
418 			parent = *p;
419 			skb = rb_to_skb(parent);
420 			if (tnext >= netem_skb_cb(skb)->time_to_send)
421 				p = &parent->rb_right;
422 			else
423 				p = &parent->rb_left;
424 		}
425 		rb_link_node(&nskb->rbnode, parent, p);
426 		rb_insert_color(&nskb->rbnode, &q->t_root);
427 	}
428 	q->t_len++;
429 	qdisc_qlen_inc(sch);
430 }
431 
432 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
433  * when we statistically choose to corrupt one, we instead segment it, returning
434  * the first packet to be corrupted, and re-enqueue the remaining frames
435  */
436 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
437 				     struct sk_buff **to_free)
438 {
439 	struct sk_buff *segs;
440 	netdev_features_t features = netif_skb_features(skb);
441 
442 	qdisc_skb_cb(skb)->pkt_segs = 1;
443 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
444 
445 	if (IS_ERR_OR_NULL(segs)) {
446 		qdisc_drop(skb, sch, to_free);
447 		return NULL;
448 	}
449 	consume_skb(skb);
450 	return segs;
451 }
452 
/*
 * Insert one skb into qdisc, applying the configured impairments
 * (duplication, loss/ECN marking, corruption, delay, rate, reordering).
 *
 * @skb:     packet to queue
 * @sch:     netem qdisc
 * @to_free: list that collects skbs dropped on this path
 *
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 * A packet removed by emulated loss is reported as
 * NET_XMIT_SUCCESS | __NET_XMIT_BYPASS.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2 = NULL;	/* clone used for duplication */
	struct sk_buff *segs = NULL;	/* remaining segments after GSO split */
	unsigned int prev_len = qdisc_pkt_len(skb);
	int count = 1;	/* copies to queue: 0 = lost, 1 = normal, 2 = duplicated */

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) {
		++count;
		WRITE_ONCE(q->duplicated, q->duplicated + 1);
	}

	/* Drop packet? */
	if (loss_event(q)) {
		/* with ECN enabled, a successful CE mark replaces the drop */
		if (q->ecn && INET_ECN_set_ce(skb)) {
			WRITE_ONCE(q->ecn_marked, q->ecn_marked + 1);
		} else {
			WRITE_ONCE(q->dropped, q->dropped + 1);
			--count;
		}
	}

	/* lost and not duplicated: free the skb but report success */
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then clone it before
	 * original is modified.
	 */
	if (count > 1) {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		/* on failure the packet is simply not duplicated */
		if (!skb2)
			WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
		if (skb_is_gso(skb)) {
			/* netem_segment() drops the skb itself on failure */
			skb = netem_segment(skb, sch, to_free);
			if (!skb) {
				WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
				goto finish_segs;
			}

			/* corrupt the first segment, keep the rest for later */
			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb_linearize(skb) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))) {
			WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		/* flip one random bit of one random byte */
		if (skb->len) {
			u32 offset = get_random_u32_below(skb->len);
			skb->data[offset] ^= 1 << get_random_u32_below(8);
			WRITE_ONCE(q->corrupted, q->corrupted + 1);
		}
	}

	/* queue full: drop the packet, its pending segments and the clone */
	if (unlikely(sch->q.qlen >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		if (skb2)
			__qdisc_drop(skb2, to_free);
		return NET_XMIT_DROP;
	}

	/*
	 * If doing duplication then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (skb2) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		skb2 = NULL;	/* ownership passed on; nothing to drop at finish_segs */
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, &q->prng, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			/* find the latest send time among all queued packets */
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			/* add the serialization time of this packet */
			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		if (delay)
			WRITE_ONCE(q->delayed, q->delayed + 1);

		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		WRITE_ONCE(q->reordered, q->reordered + 1);
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	/* only still set when an error path skipped the duplicate re-insert */
	if (skb2)
		__qdisc_drop(skb2, to_free);

	if (segs) {
		unsigned int len, last_len;
		int rc, nb;

		/* count what was queued: the first segment, if it survived ... */
		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		/* ... plus every remaining segment accepted by this qdisc */
		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		/* the packet was lost on an error path and nothing was queued */
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}
679 
680 /* Delay the next round with a new future slot with a
681  * correct number of bytes and packets.
682  */
683 
684 static void get_slot_next(struct netem_sched_data *q, u64 now)
685 {
686 	s64 next_delay;
687 
688 	if (!q->slot_dist)
689 		next_delay = q->slot_config.min_delay +
690 			mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay,
691 					get_random_u32(), 32);
692 	else
693 		next_delay = tabledist(q->slot_config.dist_delay,
694 				       (s32)(q->slot_config.dist_jitter),
695 				       NULL, &q->prng, q->slot_dist);
696 
697 	q->slot.slot_next = now + next_delay;
698 	q->slot.packets_left = q->slot_config.max_packets;
699 	q->slot.bytes_left = q->slot_config.max_bytes;
700 }
701 
702 static struct sk_buff *netem_peek(struct netem_sched_data *q)
703 {
704 	struct sk_buff *skb = skb_rb_first(&q->t_root);
705 	u64 t1, t2;
706 
707 	if (!skb)
708 		return q->t_head;
709 	if (!q->t_head)
710 		return skb;
711 
712 	t1 = netem_skb_cb(skb)->time_to_send;
713 	t2 = netem_skb_cb(q->t_head)->time_to_send;
714 	if (t1 < t2)
715 		return skb;
716 	return q->t_head;
717 }
718 
719 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
720 {
721 	if (skb == q->t_head) {
722 		q->t_head = skb->next;
723 		if (!q->t_head)
724 			q->t_tail = NULL;
725 	} else {
726 		rb_erase(&skb->rbnode, &q->t_root);
727 	}
728 }
729 
/* netem_dequeue - hand out the next packet whose send time has come
 * @sch: netem qdisc
 *
 * Packets placed on sch->q (the reordered ones) are delivered first.
 * Otherwise the earliest tfifo packet is released once both its send
 * time and the current slot allow it; with a child qdisc attached the
 * packet is moved into the child and delivery happens from there.  When
 * nothing is ready the watchdog is armed for the next release time.
 *
 * Return: the skb to transmit, or NULL if nothing can be sent now.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	/* packets queued at the head by the reordering path go out first */
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
deliver:
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* if more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		/* slotting active and the slot opens before this packet is due */
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			q->t_len--;
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			/* charge the packet to the current slot */
			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				/* read the length now; skb may be gone after the enqueue */
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS) {
					/* child refused it: undo our own accounting */
					if (net_xmit_drop_count(err))
						qdisc_qstats_drop(sch);
					qstats_backlog_sub(sch, pkt_len);
					qdisc_qlen_dec(sch);
					qdisc_tree_reduce_backlog(sch, 1, pkt_len);
				}
				goto tfifo_dequeue;
			}
			qdisc_qlen_dec(sch);
			goto deliver;
		}

		/* head packet not ready yet: the child may still hold one */
		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb) {
				qdisc_qlen_dec(sch);
				goto deliver;
			}
		}

		/* wake up when both the packet and the slot are due */
		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	/* nothing deliverable in the tfifo: fall back to the child qdisc */
	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb) {
			qdisc_qlen_dec(sch);
			goto deliver;
		}
	}
	return NULL;
}
813 
814 static void netem_reset(struct Qdisc *sch)
815 {
816 	struct netem_sched_data *q = qdisc_priv(sch);
817 
818 	qdisc_reset_queue(sch);
819 	tfifo_reset(sch);
820 	if (q->qdisc)
821 		qdisc_reset(q->qdisc);
822 	qdisc_watchdog_cancel(&q->watchdog);
823 }
824 
/* Release a distribution table; the pointer is passed straight to
 * kvfree(), matching the kvmalloc_flex() allocation in get_dist_table().
 */
static void dist_free(struct disttable *d)
{
	kvfree(d);
}
829 
830 /*
831  * Distribution data is a variable size payload containing
832  * signed 16 bit values.
833  */
834 
835 static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
836 {
837 	size_t n = nla_len(attr)/sizeof(__s16);
838 	const __s16 *data = nla_data(attr);
839 	struct disttable *d;
840 	int i;
841 
842 	if (!n || n > NETEM_DIST_MAX)
843 		return -EINVAL;
844 
845 	d = kvmalloc_flex(*d, table, n);
846 	if (!d)
847 		return -ENOMEM;
848 
849 	d->size = n;
850 	for (i = 0; i < n; i++)
851 		d->table[i] = data[i];
852 
853 	*tbl = d;
854 	return 0;
855 }
856 
857 static int validate_time(const struct nlattr *attr, const char *name,
858 			 struct netlink_ext_ack *extack)
859 {
860 	if (nla_get_s64(attr) < 0) {
861 		NL_SET_ERR_MSG_ATTR_FMT(extack, attr, "negative %s", name);
862 		return -EINVAL;
863 	}
864 	return 0;
865 }
866 
867 static int validate_slot(const struct nlattr *attr, struct netlink_ext_ack *extack)
868 {
869 	const struct tc_netem_slot *c = nla_data(attr);
870 
871 	if (c->min_delay < 0 || c->max_delay < 0) {
872 		NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot delay");
873 		return -EINVAL;
874 	}
875 	if (c->min_delay > c->max_delay) {
876 		NL_SET_ERR_MSG_ATTR(extack, attr, "slot min delay greater than max delay");
877 		return -EINVAL;
878 	}
879 	if (c->dist_delay < 0 || c->dist_jitter < 0) {
880 		NL_SET_ERR_MSG_ATTR(extack, attr, "negative dist delay");
881 		return -EINVAL;
882 	}
883 	if (c->max_packets < 0 || c->max_bytes < 0) {
884 		NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot limit");
885 		return -EINVAL;
886 	}
887 	return 0;
888 }
889 
890 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
891 {
892 	const struct tc_netem_slot *c = nla_data(attr);
893 
894 	q->slot_config = *c;
895 	if (q->slot_config.max_packets == 0)
896 		q->slot_config.max_packets = INT_MAX;
897 	if (q->slot_config.max_bytes == 0)
898 		q->slot_config.max_bytes = INT_MAX;
899 
900 	/* capping dist_jitter to the range acceptable by tabledist() */
901 	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
902 
903 	q->slot.packets_left = q->slot_config.max_packets;
904 	q->slot.bytes_left = q->slot_config.max_bytes;
905 	if (q->slot_config.min_delay | q->slot_config.max_delay |
906 	    q->slot_config.dist_jitter)
907 		q->slot.slot_next = ktime_get_ns();
908 	else
909 		q->slot.slot_next = 0;
910 }
911 
912 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
913 {
914 	const struct tc_netem_corr *c = nla_data(attr);
915 
916 	init_crandom(&q->delay_cor, c->delay_corr);
917 	init_crandom(&q->loss_cor, c->loss_corr);
918 	init_crandom(&q->dup_cor, c->dup_corr);
919 }
920 
921 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
922 {
923 	const struct tc_netem_reorder *r = nla_data(attr);
924 
925 	q->reorder = r->probability;
926 	init_crandom(&q->reorder_cor, r->correlation);
927 }
928 
929 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
930 {
931 	const struct tc_netem_corrupt *r = nla_data(attr);
932 
933 	q->corrupt = r->probability;
934 	init_crandom(&q->corrupt_cor, r->correlation);
935 }
936 
937 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
938 {
939 	const struct tc_netem_rate *r = nla_data(attr);
940 
941 	q->rate = r->rate;
942 	q->packet_overhead = r->packet_overhead;
943 	q->cell_size = r->cell_size;
944 	q->cell_overhead = r->cell_overhead;
945 	if (q->cell_size)
946 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
947 	else
948 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
949 }
950 
951 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr,
952 			struct netlink_ext_ack *extack)
953 {
954 	const struct nlattr *la;
955 	int rem;
956 
957 	nla_for_each_nested(la, attr, rem) {
958 		u16 type = nla_type(la);
959 
960 		switch (type) {
961 		case NETEM_LOSS_GI: {
962 			const struct tc_netem_gimodel *gi = nla_data(la);
963 
964 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
965 				NL_SET_ERR_MSG_ATTR(extack, la,
966 						    "netem: incorrect gi model size");
967 				return -EINVAL;
968 			}
969 
970 			q->loss_model = CLG_4_STATES;
971 
972 			q->clg.state = TX_IN_GAP_PERIOD;
973 			q->clg.a1 = gi->p13;
974 			q->clg.a2 = gi->p31;
975 			q->clg.a3 = gi->p32;
976 			q->clg.a4 = gi->p14;
977 			q->clg.a5 = gi->p23;
978 			break;
979 		}
980 
981 		case NETEM_LOSS_GE: {
982 			const struct tc_netem_gemodel *ge = nla_data(la);
983 
984 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
985 				NL_SET_ERR_MSG_ATTR(extack, la,
986 						    "netem: incorrect ge model size");
987 				return -EINVAL;
988 			}
989 
990 			q->loss_model = CLG_GILB_ELL;
991 			q->clg.state = GOOD_STATE;
992 			q->clg.a1 = ge->p;
993 			q->clg.a2 = ge->r;
994 			q->clg.a3 = ge->h;
995 			q->clg.a4 = ge->k1;
996 			break;
997 		}
998 
999 		default:
1000 			NL_SET_ERR_MSG_ATTR_FMT(extack, la,
1001 						"netem: unknown loss type %u", type);
1002 			return -EINVAL;
1003 		}
1004 	}
1005 
1006 	return 0;
1007 }
1008 
/* Netlink policy for the attributes that follow struct tc_netem_qopt.
 * Struct-carrying attributes only state a length (presumably enforced as
 * the minimum payload size by the parser - confirm against the netlink
 * policy documentation).  TCA_NETEM_DELAY_DIST and TCA_NETEM_SLOT_DIST
 * have no entry here; their size is checked in get_dist_table().
 */
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
	[TCA_NETEM_PRNG_SEED]	= { .type = NLA_U64 },
};
1022 
1023 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
1024 		      const struct nla_policy *policy, int len,
1025 		      struct netlink_ext_ack *extack)
1026 {
1027 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
1028 
1029 	if (nested_len < 0) {
1030 		NL_SET_ERR_MSG_FMT(extack, "netem: invalid attributes len %d < %d",
1031 				   nla_len(nla), NLA_ALIGN(len));
1032 		return -EINVAL;
1033 	}
1034 
1035 	if (nested_len >= nla_attr_size(0))
1036 		return nla_parse_deprecated(tb, maxtype,
1037 					    nla_data(nla) + NLA_ALIGN(len),
1038 					    nested_len, policy, extack);
1039 
1040 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
1041 	return 0;
1042 }
1043 
1044 static const struct Qdisc_class_ops netem_class_ops;
1045 
1046 static int check_netem_in_tree(struct Qdisc *sch, bool duplicates,
1047 			       struct netlink_ext_ack *extack)
1048 {
1049 	struct Qdisc *root, *q;
1050 	unsigned int i;
1051 
1052 	root = qdisc_root_sleeping(sch);
1053 
1054 	if (sch != root && root->ops->cl_ops == &netem_class_ops) {
1055 		if (duplicates ||
1056 		    ((struct netem_sched_data *)qdisc_priv(root))->duplicate)
1057 			goto err;
1058 	}
1059 
1060 	if (!qdisc_dev(root))
1061 		return 0;
1062 
1063 	hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) {
1064 		if (sch != q && q->ops->cl_ops == &netem_class_ops) {
1065 			if (duplicates ||
1066 			    ((struct netem_sched_data *)qdisc_priv(q))->duplicate)
1067 				goto err;
1068 		}
1069 	}
1070 
1071 	return 0;
1072 
1073 err:
1074 	NL_SET_ERR_MSG(extack,
1075 		       "netem: cannot mix duplicating netems with other netems in tree");
1076 	return -EINVAL;
1077 }
1078 
1079 /* Parse netlink message to set options */
1080 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
1081 			struct netlink_ext_ack *extack)
1082 {
1083 	struct netem_sched_data *q = qdisc_priv(sch);
1084 	struct nlattr *tb[TCA_NETEM_MAX + 1];
1085 	struct disttable *delay_dist = NULL;
1086 	struct disttable *slot_dist = NULL;
1087 	struct tc_netem_qopt *qopt;
1088 	struct clgstate old_clg;
1089 	int old_loss_model = CLG_RANDOM;
1090 	int ret;
1091 
1092 	qopt = nla_data(opt);
1093 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt), extack);
1094 	if (ret < 0)
1095 		return ret;
1096 
1097 	if (tb[TCA_NETEM_DELAY_DIST]) {
1098 		ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
1099 		if (ret)
1100 			goto table_free;
1101 	}
1102 
1103 	if (tb[TCA_NETEM_SLOT_DIST]) {
1104 		ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
1105 		if (ret)
1106 			goto table_free;
1107 	}
1108 
1109 	if (tb[TCA_NETEM_SLOT]) {
1110 		ret = validate_slot(tb[TCA_NETEM_SLOT], extack);
1111 		if (ret)
1112 			goto table_free;
1113 	}
1114 
1115 	if (tb[TCA_NETEM_LATENCY64]) {
1116 		ret = validate_time(tb[TCA_NETEM_LATENCY64], "latency", extack);
1117 		if (ret)
1118 			goto table_free;
1119 	}
1120 
1121 	if (tb[TCA_NETEM_JITTER64]) {
1122 		ret = validate_time(tb[TCA_NETEM_JITTER64], "jitter", extack);
1123 		if (ret)
1124 			goto table_free;
1125 	}
1126 
1127 	sch_tree_lock(sch);
1128 	/* backup q->clg and q->loss_model */
1129 	old_clg = q->clg;
1130 	old_loss_model = q->loss_model;
1131 
1132 	if (tb[TCA_NETEM_LOSS]) {
1133 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS], extack);
1134 		if (ret) {
1135 			q->loss_model = old_loss_model;
1136 			q->clg = old_clg;
1137 			goto unlock;
1138 		}
1139 	} else {
1140 		q->loss_model = CLG_RANDOM;
1141 	}
1142 
1143 	if (delay_dist)
1144 		swap(q->delay_dist, delay_dist);
1145 	if (slot_dist)
1146 		swap(q->slot_dist, slot_dist);
1147 	sch->limit = qopt->limit;
1148 
1149 	q->latency = PSCHED_TICKS2NS(qopt->latency);
1150 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1151 	q->limit = qopt->limit;
1152 	q->gap = qopt->gap;
1153 	q->counter = 0;
1154 	q->loss = qopt->loss;
1155 
1156 	ret = check_netem_in_tree(sch, qopt->duplicate, extack);
1157 	if (ret)
1158 		goto unlock;
1159 
1160 	q->duplicate = qopt->duplicate;
1161 
1162 	/* for compatibility with earlier versions.
1163 	 * if gap is set, need to assume 100% probability
1164 	 */
1165 	if (q->gap)
1166 		q->reorder = ~0;
1167 
1168 	if (tb[TCA_NETEM_CORR])
1169 		get_correlation(q, tb[TCA_NETEM_CORR]);
1170 
1171 	if (tb[TCA_NETEM_REORDER])
1172 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1173 
1174 	if (tb[TCA_NETEM_CORRUPT])
1175 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1176 
1177 	if (tb[TCA_NETEM_RATE])
1178 		get_rate(q, tb[TCA_NETEM_RATE]);
1179 
1180 	if (tb[TCA_NETEM_RATE64])
1181 		q->rate = max_t(u64, q->rate,
1182 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1183 
1184 	if (tb[TCA_NETEM_LATENCY64])
1185 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1186 
1187 	if (tb[TCA_NETEM_JITTER64])
1188 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1189 
1190 	if (tb[TCA_NETEM_ECN])
1191 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1192 
1193 	if (tb[TCA_NETEM_SLOT])
1194 		get_slot(q, tb[TCA_NETEM_SLOT]);
1195 
1196 	/* capping jitter to the range acceptable by tabledist() */
1197 	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1198 
1199 	if (tb[TCA_NETEM_PRNG_SEED]) {
1200 		q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
1201 		prandom_seed_state(&q->prng.prng_state, q->prng.seed);
1202 	}
1203 
1204 unlock:
1205 	sch_tree_unlock(sch);
1206 
1207 table_free:
1208 	dist_free(delay_dist);
1209 	dist_free(slot_dist);
1210 	return ret;
1211 }
1212 
1213 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1214 		      struct netlink_ext_ack *extack)
1215 {
1216 	struct netem_sched_data *q = qdisc_priv(sch);
1217 
1218 	qdisc_watchdog_init(&q->watchdog, sch);
1219 
1220 	if (!opt)
1221 		return -EINVAL;
1222 
1223 	q->loss_model = CLG_RANDOM;
1224 	q->prng.seed = get_random_u64();
1225 	prandom_seed_state(&q->prng.prng_state, q->prng.seed);
1226 
1227 	return netem_change(sch, opt, extack);
1228 }
1229 
1230 static void netem_destroy(struct Qdisc *sch)
1231 {
1232 	struct netem_sched_data *q = qdisc_priv(sch);
1233 
1234 	qdisc_watchdog_cancel(&q->watchdog);
1235 	if (q->qdisc)
1236 		qdisc_put(q->qdisc);
1237 	dist_free(q->delay_dist);
1238 	dist_free(q->slot_dist);
1239 }
1240 
1241 static int dump_loss_model(const struct netem_sched_data *q,
1242 			   struct sk_buff *skb)
1243 {
1244 	struct nlattr *nest;
1245 
1246 	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1247 	if (nest == NULL)
1248 		goto nla_put_failure;
1249 
1250 	switch (q->loss_model) {
1251 	case CLG_RANDOM:
1252 		/* legacy loss model */
1253 		nla_nest_cancel(skb, nest);
1254 		return 0;	/* no data */
1255 
1256 	case CLG_4_STATES: {
1257 		struct tc_netem_gimodel gi = {
1258 			.p13 = q->clg.a1,
1259 			.p31 = q->clg.a2,
1260 			.p32 = q->clg.a3,
1261 			.p14 = q->clg.a4,
1262 			.p23 = q->clg.a5,
1263 		};
1264 
1265 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1266 			goto nla_put_failure;
1267 		break;
1268 	}
1269 	case CLG_GILB_ELL: {
1270 		struct tc_netem_gemodel ge = {
1271 			.p = q->clg.a1,
1272 			.r = q->clg.a2,
1273 			.h = q->clg.a3,
1274 			.k1 = q->clg.a4,
1275 		};
1276 
1277 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1278 			goto nla_put_failure;
1279 		break;
1280 	}
1281 	}
1282 
1283 	nla_nest_end(skb, nest);
1284 	return 0;
1285 
1286 nla_put_failure:
1287 	nla_nest_cancel(skb, nest);
1288 	return -1;
1289 }
1290 
1291 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1292 {
1293 	const struct netem_sched_data *q = qdisc_priv(sch);
1294 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1295 	struct tc_netem_qopt qopt;
1296 	struct tc_netem_corr cor;
1297 	struct tc_netem_reorder reorder;
1298 	struct tc_netem_corrupt corrupt;
1299 	struct tc_netem_rate rate;
1300 	struct tc_netem_slot slot;
1301 
1302 	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1303 			     UINT_MAX);
1304 	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1305 			    UINT_MAX);
1306 	qopt.limit = q->limit;
1307 	qopt.loss = q->loss;
1308 	qopt.gap = q->gap;
1309 	qopt.duplicate = q->duplicate;
1310 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1311 		goto nla_put_failure;
1312 
1313 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1314 		goto nla_put_failure;
1315 
1316 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1317 		goto nla_put_failure;
1318 
1319 	cor.delay_corr = q->delay_cor.rho;
1320 	cor.loss_corr = q->loss_cor.rho;
1321 	cor.dup_corr = q->dup_cor.rho;
1322 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1323 		goto nla_put_failure;
1324 
1325 	reorder.probability = q->reorder;
1326 	reorder.correlation = q->reorder_cor.rho;
1327 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1328 		goto nla_put_failure;
1329 
1330 	corrupt.probability = q->corrupt;
1331 	corrupt.correlation = q->corrupt_cor.rho;
1332 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1333 		goto nla_put_failure;
1334 
1335 	if (q->rate >= (1ULL << 32)) {
1336 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1337 				      TCA_NETEM_PAD))
1338 			goto nla_put_failure;
1339 		rate.rate = ~0U;
1340 	} else {
1341 		rate.rate = q->rate;
1342 	}
1343 	rate.packet_overhead = q->packet_overhead;
1344 	rate.cell_size = q->cell_size;
1345 	rate.cell_overhead = q->cell_overhead;
1346 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1347 		goto nla_put_failure;
1348 
1349 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1350 		goto nla_put_failure;
1351 
1352 	if (dump_loss_model(q, skb) != 0)
1353 		goto nla_put_failure;
1354 
1355 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1356 	    q->slot_config.dist_jitter) {
1357 		slot = q->slot_config;
1358 		if (slot.max_packets == INT_MAX)
1359 			slot.max_packets = 0;
1360 		if (slot.max_bytes == INT_MAX)
1361 			slot.max_bytes = 0;
1362 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1363 			goto nla_put_failure;
1364 	}
1365 
1366 	if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
1367 			      TCA_NETEM_PAD))
1368 		goto nla_put_failure;
1369 
1370 	return nla_nest_end(skb, nla);
1371 
1372 nla_put_failure:
1373 	nlmsg_trim(skb, nla);
1374 	return -1;
1375 }
1376 
1377 static int netem_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1378 {
1379 	struct netem_sched_data *q = qdisc_priv(sch);
1380 	struct tc_netem_xstats st = {
1381 		.delayed    = READ_ONCE(q->delayed),
1382 		.dropped    = READ_ONCE(q->dropped),
1383 		.corrupted  = READ_ONCE(q->corrupted),
1384 		.duplicated = READ_ONCE(q->duplicated),
1385 		.reordered  = READ_ONCE(q->reordered),
1386 		.ecn_marked = READ_ONCE(q->ecn_marked),
1387 		.allocation_errors = READ_ONCE(q->allocation_errors),
1388 	};
1389 
1390 	return gnet_stats_copy_app(d, &st, sizeof(st));
1391 }
1392 
1393 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1394 			  struct sk_buff *skb, struct tcmsg *tcm)
1395 {
1396 	struct netem_sched_data *q = qdisc_priv(sch);
1397 
1398 	if (cl != 1 || !q->qdisc) 	/* only one class */
1399 		return -ENOENT;
1400 
1401 	tcm->tcm_handle |= TC_H_MIN(1);
1402 	tcm->tcm_info = q->qdisc->handle;
1403 
1404 	return 0;
1405 }
1406 
1407 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1408 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1409 {
1410 	struct netem_sched_data *q = qdisc_priv(sch);
1411 
1412 	*old = qdisc_replace(sch, new, &q->qdisc);
1413 	return 0;
1414 }
1415 
1416 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1417 {
1418 	struct netem_sched_data *q = qdisc_priv(sch);
1419 	return q->qdisc;
1420 }
1421 
1422 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1423 {
1424 	return 1;
1425 }
1426 
1427 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1428 {
1429 	if (!walker->stop) {
1430 		if (!tc_qdisc_stats_dump(sch, 1, walker))
1431 			return;
1432 	}
1433 }
1434 
1435 static const struct Qdisc_class_ops netem_class_ops = {
1436 	.graft		=	netem_graft,
1437 	.leaf		=	netem_leaf,
1438 	.find		=	netem_find,
1439 	.walk		=	netem_walk,
1440 	.dump		=	netem_dump_class,
1441 };
1442 
1443 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1444 	.id		=	"netem",
1445 	.cl_ops		=	&netem_class_ops,
1446 	.priv_size	=	sizeof(struct netem_sched_data),
1447 	.enqueue	=	netem_enqueue,
1448 	.dequeue	=	netem_dequeue,
1449 	.peek		=	qdisc_peek_dequeued,
1450 	.init		=	netem_init,
1451 	.reset		=	netem_reset,
1452 	.destroy	=	netem_destroy,
1453 	.change		=	netem_change,
1454 	.dump		=	netem_dump,
1455 	.dump_stats	=	netem_dump_stats,
1456 	.owner		=	THIS_MODULE,
1457 };
1458 MODULE_ALIAS_NET_SCH("netem");
1459 
1460 static int __init netem_module_init(void)
1461 {
1462 	return register_qdisc(&netem_qdisc_ops);
1463 }
1464 static void __exit netem_module_exit(void)
1465 {
1466 	unregister_qdisc(&netem_qdisc_ops);
1467 }
1468 module_init(netem_module_init)
1469 module_exit(netem_module_exit)
1470 MODULE_LICENSE("GPL");
1471 MODULE_DESCRIPTION("Network characteristics emulator qdisc");
1472