xref: /linux/net/sched/sch_netem.c (revision d44646fc9eeb423ad50f3043f11f66f491d908a7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_netem.c	Network emulator
4  *
5  *  		Many of the algorithms and ideas for this came from
6  *		NIST Net which is not copyrighted.
7  *
8  * Authors:	Stephen Hemminger <shemminger@osdl.org>
9  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10  */
11 
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/prandom.h>
21 #include <linux/rtnetlink.h>
22 #include <linux/reciprocal_div.h>
23 #include <linux/rbtree.h>
24 
25 #include <net/gso.h>
26 #include <net/netlink.h>
27 #include <net/pkt_sched.h>
28 #include <net/inet_ecn.h>
29 
30 /*	Network Emulation Queuing algorithm.
31 	====================================
32 
	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
35 		 [2] Luigi Rizzo, DummyNet for FreeBSD
36 
37 	 ----------------------------------------------------------------
38 
39 	 This started out as a simple way to delay outgoing packets to
40 	 test TCP but has grown to include most of the functionality
41 	 of a full blown network emulator like NISTnet. It can delay
42 	 packets and add random jitter (and correlation). The random
43 	 distribution can be loaded from a table as well to provide
44 	 normal, Pareto, or experimental curves. Packet loss,
45 	 duplication, and reordering can also be emulated.
46 
	 This qdisc does not do classification; that can be handled by
	 layering other disciplines.  It does not need to do bandwidth
49 	 control either since that can be handled by using token
50 	 bucket or other rate control.
51 
52      Correlated Loss Generator models
53 
54 	Added generation of correlated loss according to the
55 	"Gilbert-Elliot" model, a 4-state markov model.
56 
57 	References:
58 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
59 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
60 	and intuitive loss model for packet networks and its implementation
61 	in the Netem module in the Linux kernel", available in [1]
62 
	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
64 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
65 */
66 
/* Distribution table consumed by tabledist().
 * @size is the number of entries in @table; each entry is a signed
 * 16-bit sample that tabledist() scales by sigma / NETEM_DIST_SCALE.
 */
struct disttable {
	u32  size;
	s16 table[] __counted_by(size);
};
71 
/* Loss models (value stored in netem_sched_data::loss_model and
 * dispatched on by loss_event()).
 */
enum {
	CLG_RANDOM,	/* q->loss compared against get_crandom() */
	CLG_4_STATES,	/* handled by loss_4state() */
	CLG_GILB_ELL,	/* handled by loss_gilb_ell() */
};
78 
/* States in GE model (stored in clgstate::state) */
enum {
	GOOD_STATE = 1,	/* state selected when a GE model is configured */
	BAD_STATE,	/* entered from GOOD_STATE when a draw falls below a1 */
};
84 
/* States in 4 state model (stored in clgstate::state) */
enum {
	TX_IN_GAP_PERIOD = 1,	/* packet transmitted within a gap period */
	TX_IN_BURST_PERIOD,	/* packet transmitted within a burst period */
	LOST_IN_GAP_PERIOD,	/* isolated loss within a gap period */
	LOST_IN_BURST_PERIOD,	/* packet lost within a burst period */
};
92 
/* Private per-qdisc state of netem, obtained with qdisc_priv(sch).
 * Field order follows the cacheline grouping described by the section
 * comments below; do not reorder fields casually.
 */
struct netem_sched_data {
	/* Cacheline 0: tfifo state and per-packet enqueue/dequeue scalars. */
	struct rb_root		t_root;		/* packets inserted out of time order */
	struct sk_buff		*t_head;	/* first skb of the in-order list */
	struct sk_buff		*t_tail;	/* last skb of the in-order list */
	u32			t_len;		/* packets held in tree + list */
	u32			counter;	/* delayed packets since last reorder */
	s64			latency;	/* mean delay passed to tabledist() */
	s64			jitter;		/* delay variation passed to tabledist() */
	u64			rate;		/* 0 = no rate shaping; divisor in packet_time_ns() */
	u32			gap;		/* 0 = reordering disabled */
	u32			loss;		/* threshold for CLG_RANDOM, 0 = none */

	/* Cacheline 1: zero-check scalars and correlation states. */
	u32			duplicate;	/* duplication threshold, 0 = off */
	u32			reorder;	/* reorder threshold */
	u32			corrupt;	/* corruption threshold, 0 = off */
	u32			ecn;		/* non-zero: try CE marking instead of dropping */
	struct crndstate {
		u32 last;	/* previous output of get_crandom() */
		u32 rho;	/* correlation weight, 0 = uncorrelated */
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
	u8			loss_model;	/* one of CLG_* */

	/* Cacheline 2: PRNG, distribution tables, slot dequeue state etc. */
	struct prng {
		u64 seed;
		struct rnd_state prng_state;
	} prng;
	struct disttable	*delay_dist;	/* NULL = uniform distribution */
	struct slotstate {
		u64 slot_next;		/* 0 = slotting disabled, else next slot start */
		s32 packets_left;	/* packets still allowed in current slot */
		s32 bytes_left;		/* bytes still allowed in current slot */
	} slot;
	struct disttable	*slot_dist;	/* NULL = uniform between min/max delay */
	struct Qdisc		*qdisc;		/* optional child qdisc fed on dequeue */

	/*
	 * Warm: rate-shaping parameters (only read when rate != 0) and
	 * configuration-only fields.  The fast path reads sch->limit, not
	 * q->limit.
	 */
	s32			packet_overhead;
	u32			cell_size;
	struct reciprocal_value	cell_size_reciprocal;
	s32			cell_overhead;
	u32			limit;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */

		/* state of the Markov chain */
		u8  state;
	} clg;

	/* Impairment counters (updated with WRITE_ONCE() in netem_enqueue()) */
	u64			delayed;
	u64			dropped;
	u64			corrupted;
	u64			duplicated;
	u64			ecn_marked;
	u64			reordered;
	u64			allocation_errors;

	/* Cold tail: slot reschedule config and the watchdog timer. */
	struct tc_netem_slot	slot_config;
	struct qdisc_watchdog	watchdog;
};
168 
/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and
 * skb->tstamp, and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	/* absolute release time, on the ktime_get_ns() clock */
	u64	        time_to_send;
};
179 
180 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
181 {
182 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
183 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
184 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
185 }
186 
187 /* init_crandom - initialize correlated random number generator
188  * Use entropy source for initial seed.
189  */
190 static void init_crandom(struct crndstate *state, unsigned long rho)
191 {
192 	state->rho = rho;
193 	state->last = get_random_u32();
194 }
195 
196 /* get_crandom - correlated random number generator
197  * Next number depends on last value.
198  * rho is scaled to avoid floating point.
199  */
200 static u32 get_crandom(struct crndstate *state, struct prng *p)
201 {
202 	u64 value, rho;
203 	unsigned long answer;
204 	struct rnd_state *s = &p->prng_state;
205 
206 	if (!state || state->rho == 0)	/* no correlation */
207 		return prandom_u32_state(s);
208 
209 	value = prandom_u32_state(s);
210 	rho = (u64)state->rho + 1;
211 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
212 	state->last = answer;
213 	return answer;
214 }
215 
216 /* loss_4state - 4-state model loss generator
217  * Generates losses according to the 4-state Markov chain adopted in
218  * the GI (General and Intuitive) loss model.
219  */
220 static bool loss_4state(struct netem_sched_data *q)
221 {
222 	struct clgstate *clg = &q->clg;
223 	u32 rnd = prandom_u32_state(&q->prng.prng_state);
224 
225 	/*
226 	 * Makes a comparison between rnd and the transition
227 	 * probabilities outgoing from the current state, then decides the
228 	 * next state and if the next packet has to be transmitted or lost.
229 	 * The four states correspond to:
230 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
231 	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
232 	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
233 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
234 	 */
235 	switch (clg->state) {
236 	case TX_IN_GAP_PERIOD:
237 		if (rnd < clg->a4) {
238 			clg->state = LOST_IN_GAP_PERIOD;
239 			return true;
240 		} else if (rnd < clg->a1 + clg->a4) {
241 			clg->state = LOST_IN_BURST_PERIOD;
242 			return true;
243 		} else {
244 			clg->state = TX_IN_GAP_PERIOD;
245 		}
246 
247 		break;
248 	case TX_IN_BURST_PERIOD:
249 		if (rnd < clg->a5) {
250 			clg->state = LOST_IN_BURST_PERIOD;
251 			return true;
252 		} else {
253 			clg->state = TX_IN_BURST_PERIOD;
254 		}
255 
256 		break;
257 	case LOST_IN_BURST_PERIOD:
258 		if (rnd < clg->a3)
259 			clg->state = TX_IN_BURST_PERIOD;
260 		else if (rnd < clg->a2 + clg->a3) {
261 			clg->state = TX_IN_GAP_PERIOD;
262 		} else {
263 			clg->state = LOST_IN_BURST_PERIOD;
264 			return true;
265 		}
266 		break;
267 	case LOST_IN_GAP_PERIOD:
268 		clg->state = TX_IN_GAP_PERIOD;
269 		break;
270 	}
271 
272 	return false;
273 }
274 
275 /* loss_gilb_ell - Gilbert-Elliot model loss generator
276  * Generates losses according to the Gilbert-Elliot loss model or
277  * its special cases  (Gilbert or Simple Gilbert)
278  *
279  * Makes a comparison between random number and the transition
280  * probabilities outgoing from the current state, then decides the
281  * next state. A second random number is extracted and the comparison
282  * with the loss probability of the current state decides if the next
283  * packet will be transmitted or lost.
284  */
285 static bool loss_gilb_ell(struct netem_sched_data *q)
286 {
287 	struct clgstate *clg = &q->clg;
288 	struct rnd_state *s = &q->prng.prng_state;
289 
290 	switch (clg->state) {
291 	case GOOD_STATE:
292 		if (prandom_u32_state(s) < clg->a1)
293 			clg->state = BAD_STATE;
294 		if (prandom_u32_state(s) < clg->a4)
295 			return true;
296 		break;
297 	case BAD_STATE:
298 		if (prandom_u32_state(s) < clg->a2)
299 			clg->state = GOOD_STATE;
300 		if (prandom_u32_state(s) > clg->a3)
301 			return true;
302 	}
303 
304 	return false;
305 }
306 
307 static bool loss_event(struct netem_sched_data *q)
308 {
309 	switch (q->loss_model) {
310 	case CLG_RANDOM:
311 		/* Random packet drop 0 => none, ~0 => all */
312 		return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);
313 
314 	case CLG_4_STATES:
315 		/* 4state loss model algorithm (used also for GI model)
316 		* Extracts a value from the markov 4 state loss generator,
317 		* if it is 1 drops a packet and if needed writes the event in
318 		* the kernel logs
319 		*/
320 		return loss_4state(q);
321 
322 	case CLG_GILB_ELL:
323 		/* Gilbert-Elliot loss model algorithm
324 		* Extracts a value from the Gilbert-Elliot loss generator,
325 		* if it is 1 drops a packet and if needed writes the event in
326 		* the kernel logs
327 		*/
328 		return loss_gilb_ell(q);
329 	}
330 
331 	return false;	/* not reached */
332 }
333 
334 
335 /* tabledist - return a pseudo-randomly distributed value with mean mu and
336  * std deviation sigma.  Uses table lookup to approximate the desired
337  * distribution, and a uniformly-distributed pseudo-random source.
338  */
339 static s64 tabledist(s64 mu, s32 sigma,
340 		     struct crndstate *state,
341 		     struct prng *prng,
342 		     const struct disttable *dist)
343 {
344 	s64 x;
345 	long t;
346 	u32 rnd;
347 
348 	if (sigma == 0)
349 		return mu;
350 
351 	rnd = get_crandom(state, prng);
352 
353 	/* default uniform distribution */
354 	if (dist == NULL)
355 		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
356 
357 	t = dist->table[rnd % dist->size];
358 	x = (sigma % NETEM_DIST_SCALE) * t;
359 	if (x >= 0)
360 		x += NETEM_DIST_SCALE/2;
361 	else
362 		x -= NETEM_DIST_SCALE/2;
363 
364 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
365 }
366 
367 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
368 {
369 	len += q->packet_overhead;
370 
371 	if (q->cell_size) {
372 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
373 
374 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
375 			cells++;
376 		len = cells * (q->cell_size + q->cell_overhead);
377 	}
378 
379 	return div64_u64(len * NSEC_PER_SEC, q->rate);
380 }
381 
382 static void tfifo_reset(struct Qdisc *sch)
383 {
384 	struct netem_sched_data *q = qdisc_priv(sch);
385 	struct rb_node *p = rb_first(&q->t_root);
386 
387 	while (p) {
388 		struct sk_buff *skb = rb_to_skb(p);
389 
390 		p = rb_next(p);
391 		rb_erase(&skb->rbnode, &q->t_root);
392 		rtnl_kfree_skbs(skb, skb);
393 	}
394 
395 	rtnl_kfree_skbs(q->t_head, q->t_tail);
396 	q->t_head = NULL;
397 	q->t_tail = NULL;
398 	q->t_len = 0;
399 }
400 
401 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
402 {
403 	struct netem_sched_data *q = qdisc_priv(sch);
404 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
405 
406 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
407 		if (q->t_tail)
408 			q->t_tail->next = nskb;
409 		else
410 			q->t_head = nskb;
411 		q->t_tail = nskb;
412 	} else {
413 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
414 
415 		while (*p) {
416 			struct sk_buff *skb;
417 
418 			parent = *p;
419 			skb = rb_to_skb(parent);
420 			if (tnext >= netem_skb_cb(skb)->time_to_send)
421 				p = &parent->rb_right;
422 			else
423 				p = &parent->rb_left;
424 		}
425 		rb_link_node(&nskb->rbnode, parent, p);
426 		rb_insert_color(&nskb->rbnode, &q->t_root);
427 	}
428 	q->t_len++;
429 	qdisc_qlen_inc(sch);
430 }
431 
432 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
433  * when we statistically choose to corrupt one, we instead segment it, returning
434  * the first packet to be corrupted, and re-enqueue the remaining frames
435  */
436 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
437 				     struct sk_buff **to_free)
438 {
439 	struct sk_buff *segs;
440 	netdev_features_t features = netif_skb_features(skb);
441 
442 	qdisc_skb_cb(skb)->pkt_segs = 1;
443 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
444 
445 	if (IS_ERR_OR_NULL(segs)) {
446 		qdisc_drop(skb, sch, to_free);
447 		return NULL;
448 	}
449 	consume_skb(skb);
450 	return segs;
451 }
452 
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 * 	NET_XMIT_DROP: queue length didn't change.
 *      NET_XMIT_SUCCESS: one skb was queued.
 *
 * Steps, in order (the order also fixes the sequence of PRNG draws):
 *   1. decide on duplication and on loss (loss may become an ECN mark),
 *   2. clone the packet if a duplicate is wanted,
 *   3. optionally corrupt one bit (GSO packets are segmented first),
 *   4. enforce sch->limit,
 *   5. re-inject the duplicate at the root qdisc,
 *   6. either compute time_to_send and put the packet in the tfifo, or
 *      reorder it to the head of sch->q,
 *   7. enqueue any remaining GSO segments and fix up parent accounting.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2 = NULL;
	struct sk_buff *segs = NULL;
	unsigned int prev_len = qdisc_pkt_len(skb);
	/* number of copies of this packet to queue: 0, 1 or 2 */
	int count = 1;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication; packets with non-zero tc_depth are skipped */
	if (q->duplicate && skb->tc_depth == 0 &&
	    q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) {
		++count;
		WRITE_ONCE(q->duplicated, q->duplicated + 1);
	}

	/* Drop packet? */
	if (loss_event(q)) {
		/* with ECN enabled, a successful CE mark replaces the drop */
		if (q->ecn && INET_ECN_set_ce(skb)) {
			WRITE_ONCE(q->ecn_marked, q->ecn_marked + 1);
		} else {
			WRITE_ONCE(q->dropped, q->dropped + 1);
			--count;
		}
	}

	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		/* reported as success with the bypass flag set */
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then clone it before
	 * original is modified.
	 */
	if (count > 1) {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		/* a failed clone only loses the duplicate, not the original */
		if (!skb2)
			WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
		if (skb_is_gso(skb)) {
			skb = netem_segment(skb, sch, to_free);
			if (!skb) {
				WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
				goto finish_segs;
			}

			/* detach the first segment; the rest is queued later */
			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb_linearize(skb) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))) {
			WRITE_ONCE(q->allocation_errors, q->allocation_errors + 1);
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		if (skb->len) {
			/* flip one random bit of one random byte */
			u32 offset = get_random_u32_below(skb->len);
			skb->data[offset] ^= 1 << get_random_u32_below(8);
			WRITE_ONCE(q->corrupted, q->corrupted + 1);
		}
	}

	if (unlikely(sch->q.qlen >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		if (skb2)
			__qdisc_drop(skb2, to_free);
		return NET_XMIT_DROP;
	}

	/*
	 * If doing duplication then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (skb2) {
		struct Qdisc *rootq = qdisc_root_bh(sch);

		skb2->tc_depth++; /* prevent duplicating a dup... */
		rootq->enqueue(skb2, rootq, to_free);
		/* ownership passed to the root qdisc; nothing to free below */
		skb2 = NULL;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, &q->prng, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			/* cb of the queued packet with the latest time_to_send */
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		if (delay)
			WRITE_ONCE(q->delayed, q->delayed + 1);

		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		WRITE_ONCE(q->reordered, q->reordered + 1);
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	/* only non-NULL here when an error path skipped the re-injection */
	if (skb2)
		__qdisc_drop(skb2, to_free);

	if (segs) {
		unsigned int len, last_len;
		int rc, nb;

		/* start from the first segment, if it survived corruption */
		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			/* skb2 is reused here as the "next segment" cursor */
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}
678 
679 /* Delay the next round with a new future slot with a
680  * correct number of bytes and packets.
681  */
682 
683 static void get_slot_next(struct netem_sched_data *q, u64 now)
684 {
685 	s64 next_delay;
686 
687 	if (!q->slot_dist)
688 		next_delay = q->slot_config.min_delay +
689 			mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay,
690 					get_random_u32(), 32);
691 	else
692 		next_delay = tabledist(q->slot_config.dist_delay,
693 				       (s32)(q->slot_config.dist_jitter),
694 				       NULL, &q->prng, q->slot_dist);
695 
696 	q->slot.slot_next = now + next_delay;
697 	q->slot.packets_left = q->slot_config.max_packets;
698 	q->slot.bytes_left = q->slot_config.max_bytes;
699 }
700 
701 static struct sk_buff *netem_peek(struct netem_sched_data *q)
702 {
703 	struct sk_buff *skb = skb_rb_first(&q->t_root);
704 	u64 t1, t2;
705 
706 	if (!skb)
707 		return q->t_head;
708 	if (!q->t_head)
709 		return skb;
710 
711 	t1 = netem_skb_cb(skb)->time_to_send;
712 	t2 = netem_skb_cb(q->t_head)->time_to_send;
713 	if (t1 < t2)
714 		return skb;
715 	return q->t_head;
716 }
717 
718 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
719 {
720 	if (skb == q->t_head) {
721 		q->t_head = skb->next;
722 		if (!q->t_head)
723 			q->t_tail = NULL;
724 	} else {
725 		rb_erase(&skb->rbnode, &q->t_root);
726 	}
727 }
728 
/* netem_dequeue - hand the next deliverable packet to the caller
 * @sch: the netem qdisc
 *
 * Packets sitting in sch->q (reordered ones) are delivered first.
 * Otherwise the earliest tfifo packet is released once both its
 * time_to_send and the current slot start have passed; it is either
 * delivered directly or, when a child qdisc is attached, enqueued into
 * the child and the whole procedure restarts.  If nothing is ready the
 * child qdisc is polled and the watchdog is armed for the later of the
 * packet's time_to_send and the slot start.
 *
 * Returns the skb to transmit, or NULL if nothing can be sent now.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
		/* also the jump target for every other delivery path below */
deliver:
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* if more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		/* slot starts before the packet is due: pick a new slot */
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			q->t_len--;
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				/* charge the slot budget; reschedule when used up */
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				/* read the length first: skb is handed to the child below */
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS) {
					/* child refused the packet: undo our accounting */
					if (net_xmit_drop_count(err))
						qdisc_qstats_drop(sch);
					qstats_backlog_sub(sch, pkt_len);
					qdisc_qlen_dec(sch);
					qdisc_tree_reduce_backlog(sch, 1, pkt_len);
				}
				goto tfifo_dequeue;
			}
			qdisc_qlen_dec(sch);
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb) {
				qdisc_qlen_dec(sch);
				goto deliver;
			}
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	/* tfifo empty or not ready: the child may still hold packets */
	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb) {
			qdisc_qlen_dec(sch);
			goto deliver;
		}
	}
	return NULL;
}
812 
813 static void netem_reset(struct Qdisc *sch)
814 {
815 	struct netem_sched_data *q = qdisc_priv(sch);
816 
817 	qdisc_reset_queue(sch);
818 	tfifo_reset(sch);
819 	if (q->qdisc)
820 		qdisc_reset(q->qdisc);
821 	qdisc_watchdog_cancel(&q->watchdog);
822 }
823 
/* dist_free - release a distribution table
 * @d: table to free; it is passed straight to kvfree()
 */
static void dist_free(struct disttable *d)
{
	kvfree(d);
}
828 
829 /*
830  * Distribution data is a variable size payload containing
831  * signed 16 bit values.
832  */
833 
834 static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
835 {
836 	size_t n = nla_len(attr)/sizeof(__s16);
837 	const __s16 *data = nla_data(attr);
838 	struct disttable *d;
839 	int i;
840 
841 	if (!n || n > NETEM_DIST_MAX)
842 		return -EINVAL;
843 
844 	d = kvmalloc_flex(*d, table, n);
845 	if (!d)
846 		return -ENOMEM;
847 
848 	d->size = n;
849 	for (i = 0; i < n; i++)
850 		d->table[i] = data[i];
851 
852 	*tbl = d;
853 	return 0;
854 }
855 
856 static int validate_time(const struct nlattr *attr, const char *name,
857 			 struct netlink_ext_ack *extack)
858 {
859 	if (nla_get_s64(attr) < 0) {
860 		NL_SET_ERR_MSG_ATTR_FMT(extack, attr, "negative %s", name);
861 		return -EINVAL;
862 	}
863 	return 0;
864 }
865 
866 static int validate_slot(const struct nlattr *attr, struct netlink_ext_ack *extack)
867 {
868 	const struct tc_netem_slot *c = nla_data(attr);
869 
870 	if (c->min_delay < 0 || c->max_delay < 0) {
871 		NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot delay");
872 		return -EINVAL;
873 	}
874 	if (c->min_delay > c->max_delay) {
875 		NL_SET_ERR_MSG_ATTR(extack, attr, "slot min delay greater than max delay");
876 		return -EINVAL;
877 	}
878 	if (c->dist_delay < 0 || c->dist_jitter < 0) {
879 		NL_SET_ERR_MSG_ATTR(extack, attr, "negative dist delay");
880 		return -EINVAL;
881 	}
882 	if (c->max_packets < 0 || c->max_bytes < 0) {
883 		NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot limit");
884 		return -EINVAL;
885 	}
886 	return 0;
887 }
888 
889 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
890 {
891 	const struct tc_netem_slot *c = nla_data(attr);
892 
893 	q->slot_config = *c;
894 	if (q->slot_config.max_packets == 0)
895 		q->slot_config.max_packets = INT_MAX;
896 	if (q->slot_config.max_bytes == 0)
897 		q->slot_config.max_bytes = INT_MAX;
898 
899 	/* capping dist_jitter to the range acceptable by tabledist() */
900 	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
901 
902 	q->slot.packets_left = q->slot_config.max_packets;
903 	q->slot.bytes_left = q->slot_config.max_bytes;
904 	if (q->slot_config.min_delay | q->slot_config.max_delay |
905 	    q->slot_config.dist_jitter)
906 		q->slot.slot_next = ktime_get_ns();
907 	else
908 		q->slot.slot_next = 0;
909 }
910 
911 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
912 {
913 	const struct tc_netem_corr *c = nla_data(attr);
914 
915 	init_crandom(&q->delay_cor, c->delay_corr);
916 	init_crandom(&q->loss_cor, c->loss_corr);
917 	init_crandom(&q->dup_cor, c->dup_corr);
918 }
919 
920 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
921 {
922 	const struct tc_netem_reorder *r = nla_data(attr);
923 
924 	q->reorder = r->probability;
925 	init_crandom(&q->reorder_cor, r->correlation);
926 }
927 
928 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
929 {
930 	const struct tc_netem_corrupt *r = nla_data(attr);
931 
932 	q->corrupt = r->probability;
933 	init_crandom(&q->corrupt_cor, r->correlation);
934 }
935 
936 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
937 {
938 	const struct tc_netem_rate *r = nla_data(attr);
939 
940 	q->rate = r->rate;
941 	q->packet_overhead = r->packet_overhead;
942 	q->cell_size = r->cell_size;
943 	q->cell_overhead = r->cell_overhead;
944 	if (q->cell_size)
945 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
946 	else
947 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
948 }
949 
950 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr,
951 			struct netlink_ext_ack *extack)
952 {
953 	const struct nlattr *la;
954 	int rem;
955 
956 	nla_for_each_nested(la, attr, rem) {
957 		u16 type = nla_type(la);
958 
959 		switch (type) {
960 		case NETEM_LOSS_GI: {
961 			const struct tc_netem_gimodel *gi = nla_data(la);
962 
963 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
964 				NL_SET_ERR_MSG_ATTR(extack, la,
965 						    "netem: incorrect gi model size");
966 				return -EINVAL;
967 			}
968 
969 			q->loss_model = CLG_4_STATES;
970 
971 			q->clg.state = TX_IN_GAP_PERIOD;
972 			q->clg.a1 = gi->p13;
973 			q->clg.a2 = gi->p31;
974 			q->clg.a3 = gi->p32;
975 			q->clg.a4 = gi->p14;
976 			q->clg.a5 = gi->p23;
977 			break;
978 		}
979 
980 		case NETEM_LOSS_GE: {
981 			const struct tc_netem_gemodel *ge = nla_data(la);
982 
983 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
984 				NL_SET_ERR_MSG_ATTR(extack, la,
985 						    "netem: incorrect ge model size");
986 				return -EINVAL;
987 			}
988 
989 			q->loss_model = CLG_GILB_ELL;
990 			q->clg.state = GOOD_STATE;
991 			q->clg.a1 = ge->p;
992 			q->clg.a2 = ge->r;
993 			q->clg.a3 = ge->h;
994 			q->clg.a4 = ge->k1;
995 			break;
996 		}
997 
998 		default:
999 			NL_SET_ERR_MSG_ATTR_FMT(extack, la,
1000 						"netem: unknown loss type %u", type);
1001 			return -EINVAL;
1002 		}
1003 	}
1004 
1005 	return 0;
1006 }
1007 
/* Netlink attribute policy passed by netem_change() to parse_attr() for
 * the attributes that follow struct tc_netem_qopt.
 *
 * TCA_NETEM_DELAY_DIST and TCA_NETEM_SLOT_DIST have no entry here; their
 * payload size is checked by get_dist_table().
 *
 * NOTE(review): entries that only set .len are presumably length-checked
 * by the netlink core against that value - confirm against the
 * struct nla_policy documentation.
 */
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
	[TCA_NETEM_PRNG_SEED]	= { .type = NLA_U64 },
};
1021 
1022 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
1023 		      const struct nla_policy *policy, int len,
1024 		      struct netlink_ext_ack *extack)
1025 {
1026 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
1027 
1028 	if (nested_len < 0) {
1029 		NL_SET_ERR_MSG_FMT(extack, "netem: invalid attributes len %d < %d",
1030 				   nla_len(nla), NLA_ALIGN(len));
1031 		return -EINVAL;
1032 	}
1033 
1034 	if (nested_len >= nla_attr_size(0))
1035 		return nla_parse_deprecated(tb, maxtype,
1036 					    nla_data(nla) + NLA_ALIGN(len),
1037 					    nested_len, policy, extack);
1038 
1039 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
1040 	return 0;
1041 }
1042 
1043 /* Parse netlink message to set options */
1044 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
1045 			struct netlink_ext_ack *extack)
1046 {
1047 	struct netem_sched_data *q = qdisc_priv(sch);
1048 	struct nlattr *tb[TCA_NETEM_MAX + 1];
1049 	struct disttable *delay_dist = NULL;
1050 	struct disttable *slot_dist = NULL;
1051 	struct tc_netem_qopt *qopt;
1052 	struct clgstate old_clg;
1053 	int old_loss_model = CLG_RANDOM;
1054 	int ret;
1055 
1056 	qopt = nla_data(opt);
1057 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt), extack);
1058 	if (ret < 0)
1059 		return ret;
1060 
1061 	if (tb[TCA_NETEM_DELAY_DIST]) {
1062 		ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
1063 		if (ret)
1064 			goto table_free;
1065 	}
1066 
1067 	if (tb[TCA_NETEM_SLOT_DIST]) {
1068 		ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
1069 		if (ret)
1070 			goto table_free;
1071 	}
1072 
1073 	if (tb[TCA_NETEM_SLOT]) {
1074 		ret = validate_slot(tb[TCA_NETEM_SLOT], extack);
1075 		if (ret)
1076 			goto table_free;
1077 	}
1078 
1079 	if (tb[TCA_NETEM_LATENCY64]) {
1080 		ret = validate_time(tb[TCA_NETEM_LATENCY64], "latency", extack);
1081 		if (ret)
1082 			goto table_free;
1083 	}
1084 
1085 	if (tb[TCA_NETEM_JITTER64]) {
1086 		ret = validate_time(tb[TCA_NETEM_JITTER64], "jitter", extack);
1087 		if (ret)
1088 			goto table_free;
1089 	}
1090 
1091 	sch_tree_lock(sch);
1092 	/* backup q->clg and q->loss_model */
1093 	old_clg = q->clg;
1094 	old_loss_model = q->loss_model;
1095 
1096 	if (tb[TCA_NETEM_LOSS]) {
1097 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS], extack);
1098 		if (ret) {
1099 			q->loss_model = old_loss_model;
1100 			q->clg = old_clg;
1101 			goto unlock;
1102 		}
1103 	} else {
1104 		q->loss_model = CLG_RANDOM;
1105 	}
1106 
1107 	if (delay_dist)
1108 		swap(q->delay_dist, delay_dist);
1109 	if (slot_dist)
1110 		swap(q->slot_dist, slot_dist);
1111 	sch->limit = qopt->limit;
1112 
1113 	q->latency = PSCHED_TICKS2NS(qopt->latency);
1114 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1115 	q->limit = qopt->limit;
1116 	q->gap = qopt->gap;
1117 	q->counter = 0;
1118 	q->loss = qopt->loss;
1119 	q->duplicate = qopt->duplicate;
1120 
1121 	/* for compatibility with earlier versions.
1122 	 * if gap is set, need to assume 100% probability
1123 	 */
1124 	if (q->gap)
1125 		q->reorder = ~0;
1126 
1127 	if (tb[TCA_NETEM_CORR])
1128 		get_correlation(q, tb[TCA_NETEM_CORR]);
1129 
1130 	if (tb[TCA_NETEM_REORDER])
1131 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1132 
1133 	if (tb[TCA_NETEM_CORRUPT])
1134 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1135 
1136 	if (tb[TCA_NETEM_RATE])
1137 		get_rate(q, tb[TCA_NETEM_RATE]);
1138 
1139 	if (tb[TCA_NETEM_RATE64])
1140 		q->rate = max_t(u64, q->rate,
1141 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1142 
1143 	if (tb[TCA_NETEM_LATENCY64])
1144 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1145 
1146 	if (tb[TCA_NETEM_JITTER64])
1147 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1148 
1149 	if (tb[TCA_NETEM_ECN])
1150 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1151 
1152 	if (tb[TCA_NETEM_SLOT])
1153 		get_slot(q, tb[TCA_NETEM_SLOT]);
1154 
1155 	/* capping jitter to the range acceptable by tabledist() */
1156 	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1157 
1158 	if (tb[TCA_NETEM_PRNG_SEED]) {
1159 		q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
1160 		prandom_seed_state(&q->prng.prng_state, q->prng.seed);
1161 	}
1162 
1163 unlock:
1164 	sch_tree_unlock(sch);
1165 
1166 table_free:
1167 	dist_free(delay_dist);
1168 	dist_free(slot_dist);
1169 	return ret;
1170 }
1171 
1172 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1173 		      struct netlink_ext_ack *extack)
1174 {
1175 	struct netem_sched_data *q = qdisc_priv(sch);
1176 
1177 	qdisc_watchdog_init(&q->watchdog, sch);
1178 
1179 	if (!opt)
1180 		return -EINVAL;
1181 
1182 	q->loss_model = CLG_RANDOM;
1183 	q->prng.seed = get_random_u64();
1184 	prandom_seed_state(&q->prng.prng_state, q->prng.seed);
1185 
1186 	return netem_change(sch, opt, extack);
1187 }
1188 
1189 static void netem_destroy(struct Qdisc *sch)
1190 {
1191 	struct netem_sched_data *q = qdisc_priv(sch);
1192 
1193 	qdisc_watchdog_cancel(&q->watchdog);
1194 	if (q->qdisc)
1195 		qdisc_put(q->qdisc);
1196 	dist_free(q->delay_dist);
1197 	dist_free(q->slot_dist);
1198 }
1199 
1200 static int dump_loss_model(const struct netem_sched_data *q,
1201 			   struct sk_buff *skb)
1202 {
1203 	struct nlattr *nest;
1204 
1205 	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1206 	if (nest == NULL)
1207 		goto nla_put_failure;
1208 
1209 	switch (q->loss_model) {
1210 	case CLG_RANDOM:
1211 		/* legacy loss model */
1212 		nla_nest_cancel(skb, nest);
1213 		return 0;	/* no data */
1214 
1215 	case CLG_4_STATES: {
1216 		struct tc_netem_gimodel gi = {
1217 			.p13 = q->clg.a1,
1218 			.p31 = q->clg.a2,
1219 			.p32 = q->clg.a3,
1220 			.p14 = q->clg.a4,
1221 			.p23 = q->clg.a5,
1222 		};
1223 
1224 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1225 			goto nla_put_failure;
1226 		break;
1227 	}
1228 	case CLG_GILB_ELL: {
1229 		struct tc_netem_gemodel ge = {
1230 			.p = q->clg.a1,
1231 			.r = q->clg.a2,
1232 			.h = q->clg.a3,
1233 			.k1 = q->clg.a4,
1234 		};
1235 
1236 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1237 			goto nla_put_failure;
1238 		break;
1239 	}
1240 	}
1241 
1242 	nla_nest_end(skb, nest);
1243 	return 0;
1244 
1245 nla_put_failure:
1246 	nla_nest_cancel(skb, nest);
1247 	return -1;
1248 }
1249 
1250 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1251 {
1252 	const struct netem_sched_data *q = qdisc_priv(sch);
1253 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1254 	struct tc_netem_qopt qopt;
1255 	struct tc_netem_corr cor;
1256 	struct tc_netem_reorder reorder;
1257 	struct tc_netem_corrupt corrupt;
1258 	struct tc_netem_rate rate;
1259 	struct tc_netem_slot slot;
1260 
1261 	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1262 			     UINT_MAX);
1263 	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1264 			    UINT_MAX);
1265 	qopt.limit = q->limit;
1266 	qopt.loss = q->loss;
1267 	qopt.gap = q->gap;
1268 	qopt.duplicate = q->duplicate;
1269 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1270 		goto nla_put_failure;
1271 
1272 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1273 		goto nla_put_failure;
1274 
1275 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1276 		goto nla_put_failure;
1277 
1278 	cor.delay_corr = q->delay_cor.rho;
1279 	cor.loss_corr = q->loss_cor.rho;
1280 	cor.dup_corr = q->dup_cor.rho;
1281 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1282 		goto nla_put_failure;
1283 
1284 	reorder.probability = q->reorder;
1285 	reorder.correlation = q->reorder_cor.rho;
1286 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1287 		goto nla_put_failure;
1288 
1289 	corrupt.probability = q->corrupt;
1290 	corrupt.correlation = q->corrupt_cor.rho;
1291 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1292 		goto nla_put_failure;
1293 
1294 	if (q->rate >= (1ULL << 32)) {
1295 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1296 				      TCA_NETEM_PAD))
1297 			goto nla_put_failure;
1298 		rate.rate = ~0U;
1299 	} else {
1300 		rate.rate = q->rate;
1301 	}
1302 	rate.packet_overhead = q->packet_overhead;
1303 	rate.cell_size = q->cell_size;
1304 	rate.cell_overhead = q->cell_overhead;
1305 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1306 		goto nla_put_failure;
1307 
1308 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1309 		goto nla_put_failure;
1310 
1311 	if (dump_loss_model(q, skb) != 0)
1312 		goto nla_put_failure;
1313 
1314 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1315 	    q->slot_config.dist_jitter) {
1316 		slot = q->slot_config;
1317 		if (slot.max_packets == INT_MAX)
1318 			slot.max_packets = 0;
1319 		if (slot.max_bytes == INT_MAX)
1320 			slot.max_bytes = 0;
1321 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1322 			goto nla_put_failure;
1323 	}
1324 
1325 	if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
1326 			      TCA_NETEM_PAD))
1327 		goto nla_put_failure;
1328 
1329 	return nla_nest_end(skb, nla);
1330 
1331 nla_put_failure:
1332 	nlmsg_trim(skb, nla);
1333 	return -1;
1334 }
1335 
1336 static int netem_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1337 {
1338 	struct netem_sched_data *q = qdisc_priv(sch);
1339 	struct tc_netem_xstats st = {
1340 		.delayed    = READ_ONCE(q->delayed),
1341 		.dropped    = READ_ONCE(q->dropped),
1342 		.corrupted  = READ_ONCE(q->corrupted),
1343 		.duplicated = READ_ONCE(q->duplicated),
1344 		.reordered  = READ_ONCE(q->reordered),
1345 		.ecn_marked = READ_ONCE(q->ecn_marked),
1346 		.allocation_errors = READ_ONCE(q->allocation_errors),
1347 	};
1348 
1349 	return gnet_stats_copy_app(d, &st, sizeof(st));
1350 }
1351 
1352 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1353 			  struct sk_buff *skb, struct tcmsg *tcm)
1354 {
1355 	struct netem_sched_data *q = qdisc_priv(sch);
1356 
1357 	if (cl != 1 || !q->qdisc) 	/* only one class */
1358 		return -ENOENT;
1359 
1360 	tcm->tcm_handle |= TC_H_MIN(1);
1361 	tcm->tcm_info = q->qdisc->handle;
1362 
1363 	return 0;
1364 }
1365 
1366 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1367 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1368 {
1369 	struct netem_sched_data *q = qdisc_priv(sch);
1370 
1371 	*old = qdisc_replace(sch, new, &q->qdisc);
1372 	return 0;
1373 }
1374 
1375 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1376 {
1377 	struct netem_sched_data *q = qdisc_priv(sch);
1378 	return q->qdisc;
1379 }
1380 
1381 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1382 {
1383 	return 1;
1384 }
1385 
1386 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1387 {
1388 	if (!walker->stop) {
1389 		if (!tc_qdisc_stats_dump(sch, 1, walker))
1390 			return;
1391 	}
1392 }
1393 
1394 static const struct Qdisc_class_ops netem_class_ops = {
1395 	.graft		=	netem_graft,
1396 	.leaf		=	netem_leaf,
1397 	.find		=	netem_find,
1398 	.walk		=	netem_walk,
1399 	.dump		=	netem_dump_class,
1400 };
1401 
1402 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1403 	.id		=	"netem",
1404 	.cl_ops		=	&netem_class_ops,
1405 	.priv_size	=	sizeof(struct netem_sched_data),
1406 	.enqueue	=	netem_enqueue,
1407 	.dequeue	=	netem_dequeue,
1408 	.peek		=	qdisc_peek_dequeued,
1409 	.init		=	netem_init,
1410 	.reset		=	netem_reset,
1411 	.destroy	=	netem_destroy,
1412 	.change		=	netem_change,
1413 	.dump		=	netem_dump,
1414 	.dump_stats	=	netem_dump_stats,
1415 	.owner		=	THIS_MODULE,
1416 };
1417 MODULE_ALIAS_NET_SCH("netem");
1418 
1419 static int __init netem_module_init(void)
1420 {
1421 	return register_qdisc(&netem_qdisc_ops);
1422 }
1423 static void __exit netem_module_exit(void)
1424 {
1425 	unregister_qdisc(&netem_qdisc_ops);
1426 }
1427 module_init(netem_module_init)
1428 module_exit(netem_module_exit)
1429 MODULE_LICENSE("GPL");
1430 MODULE_DESCRIPTION("Network characteristics emulator qdisc");
1431