// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/math64.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/prandom.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can be loaded from a table as well to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification that can be handled in
	 layering other disciplines.  It does not need to do bandwidth
	 control either since that can be handled by using token
	 bucket or other rate control.

     Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S.
Salsano, F. Ludovici, A. Ordine, "Definition of a general 62 and intuitive loss model for packet networks and its implementation 63 in the Netem module in the Linux kernel", available in [1] 64 65 Authors: Stefano Salsano <stefano.salsano at uniroma2.it 66 Fabio Ludovici <fabio.ludovici at yahoo.it> 67 */ 68 69 struct disttable { 70 u32 size; 71 s16 table[] __counted_by(size); 72 }; 73 74 struct netem_sched_data { 75 /* internal t(ime)fifo qdisc uses t_root and sch->limit */ 76 struct rb_root t_root; 77 78 /* a linear queue; reduces rbtree rebalancing when jitter is low */ 79 struct sk_buff *t_head; 80 struct sk_buff *t_tail; 81 82 u32 t_len; 83 84 /* optional qdisc for classful handling (NULL at netem init) */ 85 struct Qdisc *qdisc; 86 87 struct qdisc_watchdog watchdog; 88 89 s64 latency; 90 s64 jitter; 91 92 u32 loss; 93 u32 ecn; 94 u32 limit; 95 u32 counter; 96 u32 gap; 97 u32 duplicate; 98 u32 reorder; 99 u32 corrupt; 100 u64 rate; 101 s32 packet_overhead; 102 u32 cell_size; 103 struct reciprocal_value cell_size_reciprocal; 104 s32 cell_overhead; 105 106 struct crndstate { 107 u32 last; 108 u32 rho; 109 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; 110 111 struct prng { 112 u64 seed; 113 struct rnd_state prng_state; 114 } prng; 115 116 struct disttable *delay_dist; 117 118 enum { 119 CLG_RANDOM, 120 CLG_4_STATES, 121 CLG_GILB_ELL, 122 } loss_model; 123 124 enum { 125 TX_IN_GAP_PERIOD = 1, 126 TX_IN_BURST_PERIOD, 127 LOST_IN_GAP_PERIOD, 128 LOST_IN_BURST_PERIOD, 129 } _4_state_model; 130 131 enum { 132 GOOD_STATE = 1, 133 BAD_STATE, 134 } GE_state_model; 135 136 /* Correlated Loss Generation models */ 137 struct clgstate { 138 /* state of the Markov chain */ 139 u8 state; 140 141 /* 4-states and Gilbert-Elliot models */ 142 u32 a1; /* p13 for 4-states or p for GE */ 143 u32 a2; /* p31 for 4-states or r for GE */ 144 u32 a3; /* p32 for 4-states or h for GE */ 145 u32 a4; /* p14 for 4-states or 1-k for GE */ 146 u32 a5; /* p23 used only in 4-states */ 
147 } clg; 148 149 struct tc_netem_slot slot_config; 150 struct slotstate { 151 u64 slot_next; 152 s32 packets_left; 153 s32 bytes_left; 154 } slot; 155 156 struct disttable *slot_dist; 157 }; 158 159 /* Time stamp put into socket buffer control block 160 * Only valid when skbs are in our internal t(ime)fifo queue. 161 * 162 * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp, 163 * and skb->next & skb->prev are scratch space for a qdisc, 164 * we save skb->tstamp value in skb->cb[] before destroying it. 165 */ 166 struct netem_skb_cb { 167 u64 time_to_send; 168 }; 169 170 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) 171 { 172 /* we assume we can use skb next/prev/tstamp as storage for rb_node */ 173 qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); 174 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; 175 } 176 177 /* init_crandom - initialize correlated random number generator 178 * Use entropy source for initial seed. 179 */ 180 static void init_crandom(struct crndstate *state, unsigned long rho) 181 { 182 state->rho = rho; 183 state->last = get_random_u32(); 184 } 185 186 /* get_crandom - correlated random number generator 187 * Next number depends on last value. 188 * rho is scaled to avoid floating point. 189 */ 190 static u32 get_crandom(struct crndstate *state, struct prng *p) 191 { 192 u64 value, rho; 193 unsigned long answer; 194 struct rnd_state *s = &p->prng_state; 195 196 if (!state || state->rho == 0) /* no correlation */ 197 return prandom_u32_state(s); 198 199 value = prandom_u32_state(s); 200 rho = (u64)state->rho + 1; 201 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; 202 state->last = answer; 203 return answer; 204 } 205 206 /* loss_4state - 4-state model loss generator 207 * Generates losses according to the 4-state Markov chain adopted in 208 * the GI (General and Intuitive) loss model. 
209 */ 210 static bool loss_4state(struct netem_sched_data *q) 211 { 212 struct clgstate *clg = &q->clg; 213 u32 rnd = prandom_u32_state(&q->prng.prng_state); 214 215 /* 216 * Makes a comparison between rnd and the transition 217 * probabilities outgoing from the current state, then decides the 218 * next state and if the next packet has to be transmitted or lost. 219 * The four states correspond to: 220 * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period 221 * LOST_IN_GAP_PERIOD => isolated losses within a gap period 222 * LOST_IN_BURST_PERIOD => lost packets within a burst period 223 * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period 224 */ 225 switch (clg->state) { 226 case TX_IN_GAP_PERIOD: 227 if (rnd < clg->a4) { 228 clg->state = LOST_IN_GAP_PERIOD; 229 return true; 230 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { 231 clg->state = LOST_IN_BURST_PERIOD; 232 return true; 233 } else if (clg->a1 + clg->a4 < rnd) { 234 clg->state = TX_IN_GAP_PERIOD; 235 } 236 237 break; 238 case TX_IN_BURST_PERIOD: 239 if (rnd < clg->a5) { 240 clg->state = LOST_IN_BURST_PERIOD; 241 return true; 242 } else { 243 clg->state = TX_IN_BURST_PERIOD; 244 } 245 246 break; 247 case LOST_IN_BURST_PERIOD: 248 if (rnd < clg->a3) 249 clg->state = TX_IN_BURST_PERIOD; 250 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { 251 clg->state = TX_IN_GAP_PERIOD; 252 } else if (clg->a2 + clg->a3 < rnd) { 253 clg->state = LOST_IN_BURST_PERIOD; 254 return true; 255 } 256 break; 257 case LOST_IN_GAP_PERIOD: 258 clg->state = TX_IN_GAP_PERIOD; 259 break; 260 } 261 262 return false; 263 } 264 265 /* loss_gilb_ell - Gilbert-Elliot model loss generator 266 * Generates losses according to the Gilbert-Elliot loss model or 267 * its special cases (Gilbert or Simple Gilbert) 268 * 269 * Makes a comparison between random number and the transition 270 * probabilities outgoing from the current state, then decides the 271 * next state. 
A second random number is extracted and the comparison 272 * with the loss probability of the current state decides if the next 273 * packet will be transmitted or lost. 274 */ 275 static bool loss_gilb_ell(struct netem_sched_data *q) 276 { 277 struct clgstate *clg = &q->clg; 278 struct rnd_state *s = &q->prng.prng_state; 279 280 switch (clg->state) { 281 case GOOD_STATE: 282 if (prandom_u32_state(s) < clg->a1) 283 clg->state = BAD_STATE; 284 if (prandom_u32_state(s) < clg->a4) 285 return true; 286 break; 287 case BAD_STATE: 288 if (prandom_u32_state(s) < clg->a2) 289 clg->state = GOOD_STATE; 290 if (prandom_u32_state(s) > clg->a3) 291 return true; 292 } 293 294 return false; 295 } 296 297 static bool loss_event(struct netem_sched_data *q) 298 { 299 switch (q->loss_model) { 300 case CLG_RANDOM: 301 /* Random packet drop 0 => none, ~0 => all */ 302 return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng); 303 304 case CLG_4_STATES: 305 /* 4state loss model algorithm (used also for GI model) 306 * Extracts a value from the markov 4 state loss generator, 307 * if it is 1 drops a packet and if needed writes the event in 308 * the kernel logs 309 */ 310 return loss_4state(q); 311 312 case CLG_GILB_ELL: 313 /* Gilbert-Elliot loss model algorithm 314 * Extracts a value from the Gilbert-Elliot loss generator, 315 * if it is 1 drops a packet and if needed writes the event in 316 * the kernel logs 317 */ 318 return loss_gilb_ell(q); 319 } 320 321 return false; /* not reached */ 322 } 323 324 325 /* tabledist - return a pseudo-randomly distributed value with mean mu and 326 * std deviation sigma. Uses table lookup to approximate the desired 327 * distribution, and a uniformly-distributed pseudo-random source. 
328 */ 329 static s64 tabledist(s64 mu, s32 sigma, 330 struct crndstate *state, 331 struct prng *prng, 332 const struct disttable *dist) 333 { 334 s64 x; 335 long t; 336 u32 rnd; 337 338 if (sigma == 0) 339 return mu; 340 341 rnd = get_crandom(state, prng); 342 343 /* default uniform distribution */ 344 if (dist == NULL) 345 return ((rnd % (2 * (u32)sigma)) + mu) - sigma; 346 347 t = dist->table[rnd % dist->size]; 348 x = (sigma % NETEM_DIST_SCALE) * t; 349 if (x >= 0) 350 x += NETEM_DIST_SCALE/2; 351 else 352 x -= NETEM_DIST_SCALE/2; 353 354 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; 355 } 356 357 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) 358 { 359 len += q->packet_overhead; 360 361 if (q->cell_size) { 362 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); 363 364 if (len > cells * q->cell_size) /* extra cell needed for remainder */ 365 cells++; 366 len = cells * (q->cell_size + q->cell_overhead); 367 } 368 369 return div64_u64(len * NSEC_PER_SEC, q->rate); 370 } 371 372 static void tfifo_reset(struct Qdisc *sch) 373 { 374 struct netem_sched_data *q = qdisc_priv(sch); 375 struct rb_node *p = rb_first(&q->t_root); 376 377 while (p) { 378 struct sk_buff *skb = rb_to_skb(p); 379 380 p = rb_next(p); 381 rb_erase(&skb->rbnode, &q->t_root); 382 rtnl_kfree_skbs(skb, skb); 383 } 384 385 rtnl_kfree_skbs(q->t_head, q->t_tail); 386 q->t_head = NULL; 387 q->t_tail = NULL; 388 q->t_len = 0; 389 } 390 391 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) 392 { 393 struct netem_sched_data *q = qdisc_priv(sch); 394 u64 tnext = netem_skb_cb(nskb)->time_to_send; 395 396 if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { 397 if (q->t_tail) 398 q->t_tail->next = nskb; 399 else 400 q->t_head = nskb; 401 q->t_tail = nskb; 402 } else { 403 struct rb_node **p = &q->t_root.rb_node, *parent = NULL; 404 405 while (*p) { 406 struct sk_buff *skb; 407 408 parent = *p; 409 skb = rb_to_skb(parent); 410 
if (tnext >= netem_skb_cb(skb)->time_to_send) 411 p = &parent->rb_right; 412 else 413 p = &parent->rb_left; 414 } 415 rb_link_node(&nskb->rbnode, parent, p); 416 rb_insert_color(&nskb->rbnode, &q->t_root); 417 } 418 q->t_len++; 419 sch->q.qlen++; 420 } 421 422 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead 423 * when we statistically choose to corrupt one, we instead segment it, returning 424 * the first packet to be corrupted, and re-enqueue the remaining frames 425 */ 426 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, 427 struct sk_buff **to_free) 428 { 429 struct sk_buff *segs; 430 netdev_features_t features = netif_skb_features(skb); 431 432 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 433 434 if (IS_ERR_OR_NULL(segs)) { 435 qdisc_drop(skb, sch, to_free); 436 return NULL; 437 } 438 consume_skb(skb); 439 return segs; 440 } 441 442 /* 443 * Insert one skb into qdisc. 444 * Note: parent depends on return value to account for queue length. 445 * NET_XMIT_DROP: queue length didn't change. 446 * NET_XMIT_SUCCESS: one skb was queued. 447 */ 448 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 449 struct sk_buff **to_free) 450 { 451 struct netem_sched_data *q = qdisc_priv(sch); 452 /* We don't fill cb now as skb_unshare() may invalidate it */ 453 struct netem_skb_cb *cb; 454 struct sk_buff *skb2 = NULL; 455 struct sk_buff *segs = NULL; 456 unsigned int prev_len = qdisc_pkt_len(skb); 457 int count = 1; 458 459 /* Do not fool qdisc_drop_all() */ 460 skb->prev = NULL; 461 462 /* Random duplication */ 463 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) 464 ++count; 465 466 /* Drop packet? 
*/ 467 if (loss_event(q)) { 468 if (q->ecn && INET_ECN_set_ce(skb)) 469 qdisc_qstats_drop(sch); /* mark packet */ 470 else 471 --count; 472 } 473 if (count == 0) { 474 qdisc_qstats_drop(sch); 475 __qdisc_drop(skb, to_free); 476 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 477 } 478 479 /* If a delay is expected, orphan the skb. (orphaning usually takes 480 * place at TX completion time, so _before_ the link transit delay) 481 */ 482 if (q->latency || q->jitter || q->rate) 483 skb_orphan_partial(skb); 484 485 /* 486 * If we need to duplicate packet, then clone it before 487 * original is modified. 488 */ 489 if (count > 1) 490 skb2 = skb_clone(skb, GFP_ATOMIC); 491 492 /* 493 * Randomized packet corruption. 494 * Make copy if needed since we are modifying 495 * If packet is going to be hardware checksummed, then 496 * do it now in software before we mangle it. 497 */ 498 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) { 499 if (skb_is_gso(skb)) { 500 skb = netem_segment(skb, sch, to_free); 501 if (!skb) 502 goto finish_segs; 503 504 segs = skb->next; 505 skb_mark_not_on_list(skb); 506 qdisc_skb_cb(skb)->pkt_len = skb->len; 507 } 508 509 skb = skb_unshare(skb, GFP_ATOMIC); 510 if (unlikely(!skb)) { 511 qdisc_qstats_drop(sch); 512 goto finish_segs; 513 } 514 if (skb->ip_summed == CHECKSUM_PARTIAL && 515 skb_checksum_help(skb)) { 516 qdisc_drop(skb, sch, to_free); 517 skb = NULL; 518 goto finish_segs; 519 } 520 521 skb->data[get_random_u32_below(skb_headlen(skb))] ^= 522 1<<get_random_u32_below(8); 523 } 524 525 if (unlikely(q->t_len >= sch->limit)) { 526 /* re-link segs, so that qdisc_drop_all() frees them all */ 527 skb->next = segs; 528 qdisc_drop_all(skb, sch, to_free); 529 if (skb2) 530 __qdisc_drop(skb2, to_free); 531 return NET_XMIT_DROP; 532 } 533 534 /* 535 * If doing duplication then re-insert at top of the 536 * qdisc tree, since parent queuer expects that only one 537 * skb will be queued. 
538 */ 539 if (skb2) { 540 struct Qdisc *rootq = qdisc_root_bh(sch); 541 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ 542 543 q->duplicate = 0; 544 rootq->enqueue(skb2, rootq, to_free); 545 q->duplicate = dupsave; 546 skb2 = NULL; 547 } 548 549 qdisc_qstats_backlog_inc(sch, skb); 550 551 cb = netem_skb_cb(skb); 552 if (q->gap == 0 || /* not doing reordering */ 553 q->counter < q->gap - 1 || /* inside last reordering gap */ 554 q->reorder < get_crandom(&q->reorder_cor, &q->prng)) { 555 u64 now; 556 s64 delay; 557 558 delay = tabledist(q->latency, q->jitter, 559 &q->delay_cor, &q->prng, q->delay_dist); 560 561 now = ktime_get_ns(); 562 563 if (q->rate) { 564 struct netem_skb_cb *last = NULL; 565 566 if (sch->q.tail) 567 last = netem_skb_cb(sch->q.tail); 568 if (q->t_root.rb_node) { 569 struct sk_buff *t_skb; 570 struct netem_skb_cb *t_last; 571 572 t_skb = skb_rb_last(&q->t_root); 573 t_last = netem_skb_cb(t_skb); 574 if (!last || 575 t_last->time_to_send > last->time_to_send) 576 last = t_last; 577 } 578 if (q->t_tail) { 579 struct netem_skb_cb *t_last = 580 netem_skb_cb(q->t_tail); 581 582 if (!last || 583 t_last->time_to_send > last->time_to_send) 584 last = t_last; 585 } 586 587 if (last) { 588 /* 589 * Last packet in queue is reference point (now), 590 * calculate this time bonus and subtract 591 * from delay. 592 */ 593 delay -= last->time_to_send - now; 594 delay = max_t(s64, 0, delay); 595 now = last->time_to_send; 596 } 597 598 delay += packet_time_ns(qdisc_pkt_len(skb), q); 599 } 600 601 cb->time_to_send = now + delay; 602 ++q->counter; 603 tfifo_enqueue(skb, sch); 604 } else { 605 /* 606 * Do re-ordering by putting one out of N packets at the front 607 * of the queue. 
608 */ 609 cb->time_to_send = ktime_get_ns(); 610 q->counter = 0; 611 612 __qdisc_enqueue_head(skb, &sch->q); 613 sch->qstats.requeues++; 614 } 615 616 finish_segs: 617 if (skb2) 618 __qdisc_drop(skb2, to_free); 619 620 if (segs) { 621 unsigned int len, last_len; 622 int rc, nb; 623 624 len = skb ? skb->len : 0; 625 nb = skb ? 1 : 0; 626 627 while (segs) { 628 skb2 = segs->next; 629 skb_mark_not_on_list(segs); 630 qdisc_skb_cb(segs)->pkt_len = segs->len; 631 last_len = segs->len; 632 rc = qdisc_enqueue(segs, sch, to_free); 633 if (rc != NET_XMIT_SUCCESS) { 634 if (net_xmit_drop_count(rc)) 635 qdisc_qstats_drop(sch); 636 } else { 637 nb++; 638 len += last_len; 639 } 640 segs = skb2; 641 } 642 /* Parent qdiscs accounted for 1 skb of size @prev_len */ 643 qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); 644 } else if (!skb) { 645 return NET_XMIT_DROP; 646 } 647 return NET_XMIT_SUCCESS; 648 } 649 650 /* Delay the next round with a new future slot with a 651 * correct number of bytes and packets. 
652 */ 653 654 static void get_slot_next(struct netem_sched_data *q, u64 now) 655 { 656 s64 next_delay; 657 658 if (!q->slot_dist) 659 next_delay = q->slot_config.min_delay + 660 (get_random_u32() * 661 (q->slot_config.max_delay - 662 q->slot_config.min_delay) >> 32); 663 else 664 next_delay = tabledist(q->slot_config.dist_delay, 665 (s32)(q->slot_config.dist_jitter), 666 NULL, &q->prng, q->slot_dist); 667 668 q->slot.slot_next = now + next_delay; 669 q->slot.packets_left = q->slot_config.max_packets; 670 q->slot.bytes_left = q->slot_config.max_bytes; 671 } 672 673 static struct sk_buff *netem_peek(struct netem_sched_data *q) 674 { 675 struct sk_buff *skb = skb_rb_first(&q->t_root); 676 u64 t1, t2; 677 678 if (!skb) 679 return q->t_head; 680 if (!q->t_head) 681 return skb; 682 683 t1 = netem_skb_cb(skb)->time_to_send; 684 t2 = netem_skb_cb(q->t_head)->time_to_send; 685 if (t1 < t2) 686 return skb; 687 return q->t_head; 688 } 689 690 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) 691 { 692 if (skb == q->t_head) { 693 q->t_head = skb->next; 694 if (!q->t_head) 695 q->t_tail = NULL; 696 } else { 697 rb_erase(&skb->rbnode, &q->t_root); 698 } 699 } 700 701 static struct sk_buff *netem_dequeue(struct Qdisc *sch) 702 { 703 struct netem_sched_data *q = qdisc_priv(sch); 704 struct sk_buff *skb; 705 706 tfifo_dequeue: 707 skb = __qdisc_dequeue_head(&sch->q); 708 if (skb) { 709 deliver: 710 qdisc_qstats_backlog_dec(sch, skb); 711 qdisc_bstats_update(sch, skb); 712 return skb; 713 } 714 skb = netem_peek(q); 715 if (skb) { 716 u64 time_to_send; 717 u64 now = ktime_get_ns(); 718 719 /* if more time remaining? 
*/ 720 time_to_send = netem_skb_cb(skb)->time_to_send; 721 if (q->slot.slot_next && q->slot.slot_next < time_to_send) 722 get_slot_next(q, now); 723 724 if (time_to_send <= now && q->slot.slot_next <= now) { 725 netem_erase_head(q, skb); 726 q->t_len--; 727 skb->next = NULL; 728 skb->prev = NULL; 729 /* skb->dev shares skb->rbnode area, 730 * we need to restore its value. 731 */ 732 skb->dev = qdisc_dev(sch); 733 734 if (q->slot.slot_next) { 735 q->slot.packets_left--; 736 q->slot.bytes_left -= qdisc_pkt_len(skb); 737 if (q->slot.packets_left <= 0 || 738 q->slot.bytes_left <= 0) 739 get_slot_next(q, now); 740 } 741 742 if (q->qdisc) { 743 unsigned int pkt_len = qdisc_pkt_len(skb); 744 struct sk_buff *to_free = NULL; 745 int err; 746 747 err = qdisc_enqueue(skb, q->qdisc, &to_free); 748 kfree_skb_list(to_free); 749 if (err != NET_XMIT_SUCCESS) { 750 if (net_xmit_drop_count(err)) 751 qdisc_qstats_drop(sch); 752 sch->qstats.backlog -= pkt_len; 753 sch->q.qlen--; 754 qdisc_tree_reduce_backlog(sch, 1, pkt_len); 755 } 756 goto tfifo_dequeue; 757 } 758 sch->q.qlen--; 759 goto deliver; 760 } 761 762 if (q->qdisc) { 763 skb = q->qdisc->ops->dequeue(q->qdisc); 764 if (skb) { 765 sch->q.qlen--; 766 goto deliver; 767 } 768 } 769 770 qdisc_watchdog_schedule_ns(&q->watchdog, 771 max(time_to_send, 772 q->slot.slot_next)); 773 } 774 775 if (q->qdisc) { 776 skb = q->qdisc->ops->dequeue(q->qdisc); 777 if (skb) { 778 sch->q.qlen--; 779 goto deliver; 780 } 781 } 782 return NULL; 783 } 784 785 static void netem_reset(struct Qdisc *sch) 786 { 787 struct netem_sched_data *q = qdisc_priv(sch); 788 789 qdisc_reset_queue(sch); 790 tfifo_reset(sch); 791 if (q->qdisc) 792 qdisc_reset(q->qdisc); 793 qdisc_watchdog_cancel(&q->watchdog); 794 } 795 796 static void dist_free(struct disttable *d) 797 { 798 kvfree(d); 799 } 800 801 /* 802 * Distribution data is a variable size payload containing 803 * signed 16 bit values. 
804 */ 805 806 static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) 807 { 808 size_t n = nla_len(attr)/sizeof(__s16); 809 const __s16 *data = nla_data(attr); 810 struct disttable *d; 811 int i; 812 813 if (!n || n > NETEM_DIST_MAX) 814 return -EINVAL; 815 816 d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); 817 if (!d) 818 return -ENOMEM; 819 820 d->size = n; 821 for (i = 0; i < n; i++) 822 d->table[i] = data[i]; 823 824 *tbl = d; 825 return 0; 826 } 827 828 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) 829 { 830 const struct tc_netem_slot *c = nla_data(attr); 831 832 q->slot_config = *c; 833 if (q->slot_config.max_packets == 0) 834 q->slot_config.max_packets = INT_MAX; 835 if (q->slot_config.max_bytes == 0) 836 q->slot_config.max_bytes = INT_MAX; 837 838 /* capping dist_jitter to the range acceptable by tabledist() */ 839 q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter)); 840 841 q->slot.packets_left = q->slot_config.max_packets; 842 q->slot.bytes_left = q->slot_config.max_bytes; 843 if (q->slot_config.min_delay | q->slot_config.max_delay | 844 q->slot_config.dist_jitter) 845 q->slot.slot_next = ktime_get_ns(); 846 else 847 q->slot.slot_next = 0; 848 } 849 850 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) 851 { 852 const struct tc_netem_corr *c = nla_data(attr); 853 854 init_crandom(&q->delay_cor, c->delay_corr); 855 init_crandom(&q->loss_cor, c->loss_corr); 856 init_crandom(&q->dup_cor, c->dup_corr); 857 } 858 859 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) 860 { 861 const struct tc_netem_reorder *r = nla_data(attr); 862 863 q->reorder = r->probability; 864 init_crandom(&q->reorder_cor, r->correlation); 865 } 866 867 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) 868 { 869 const struct tc_netem_corrupt *r = nla_data(attr); 870 871 q->corrupt = r->probability; 872 
init_crandom(&q->corrupt_cor, r->correlation); 873 } 874 875 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) 876 { 877 const struct tc_netem_rate *r = nla_data(attr); 878 879 q->rate = r->rate; 880 q->packet_overhead = r->packet_overhead; 881 q->cell_size = r->cell_size; 882 q->cell_overhead = r->cell_overhead; 883 if (q->cell_size) 884 q->cell_size_reciprocal = reciprocal_value(q->cell_size); 885 else 886 q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; 887 } 888 889 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) 890 { 891 const struct nlattr *la; 892 int rem; 893 894 nla_for_each_nested(la, attr, rem) { 895 u16 type = nla_type(la); 896 897 switch (type) { 898 case NETEM_LOSS_GI: { 899 const struct tc_netem_gimodel *gi = nla_data(la); 900 901 if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { 902 pr_info("netem: incorrect gi model size\n"); 903 return -EINVAL; 904 } 905 906 q->loss_model = CLG_4_STATES; 907 908 q->clg.state = TX_IN_GAP_PERIOD; 909 q->clg.a1 = gi->p13; 910 q->clg.a2 = gi->p31; 911 q->clg.a3 = gi->p32; 912 q->clg.a4 = gi->p14; 913 q->clg.a5 = gi->p23; 914 break; 915 } 916 917 case NETEM_LOSS_GE: { 918 const struct tc_netem_gemodel *ge = nla_data(la); 919 920 if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { 921 pr_info("netem: incorrect ge model size\n"); 922 return -EINVAL; 923 } 924 925 q->loss_model = CLG_GILB_ELL; 926 q->clg.state = GOOD_STATE; 927 q->clg.a1 = ge->p; 928 q->clg.a2 = ge->r; 929 q->clg.a3 = ge->h; 930 q->clg.a4 = ge->k1; 931 break; 932 } 933 934 default: 935 pr_info("netem: unknown loss type %u\n", type); 936 return -EINVAL; 937 } 938 } 939 940 return 0; 941 } 942 943 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { 944 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, 945 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, 946 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, 947 [TCA_NETEM_RATE] = 
{ .len = sizeof(struct tc_netem_rate) }, 948 [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, 949 [TCA_NETEM_ECN] = { .type = NLA_U32 }, 950 [TCA_NETEM_RATE64] = { .type = NLA_U64 }, 951 [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, 952 [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, 953 [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, 954 [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 }, 955 }; 956 957 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, 958 const struct nla_policy *policy, int len) 959 { 960 int nested_len = nla_len(nla) - NLA_ALIGN(len); 961 962 if (nested_len < 0) { 963 pr_info("netem: invalid attributes len %d\n", nested_len); 964 return -EINVAL; 965 } 966 967 if (nested_len >= nla_attr_size(0)) 968 return nla_parse_deprecated(tb, maxtype, 969 nla_data(nla) + NLA_ALIGN(len), 970 nested_len, policy, NULL); 971 972 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); 973 return 0; 974 } 975 976 static const struct Qdisc_class_ops netem_class_ops; 977 978 static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, 979 struct netlink_ext_ack *extack) 980 { 981 struct Qdisc *root, *q; 982 unsigned int i; 983 984 root = qdisc_root_sleeping(sch); 985 986 if (sch != root && root->ops->cl_ops == &netem_class_ops) { 987 if (duplicates || 988 ((struct netem_sched_data *)qdisc_priv(root))->duplicate) 989 goto err; 990 } 991 992 if (!qdisc_dev(root)) 993 return 0; 994 995 hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { 996 if (sch != q && q->ops->cl_ops == &netem_class_ops) { 997 if (duplicates || 998 ((struct netem_sched_data *)qdisc_priv(q))->duplicate) 999 goto err; 1000 } 1001 } 1002 1003 return 0; 1004 1005 err: 1006 NL_SET_ERR_MSG(extack, 1007 "netem: cannot mix duplicating netems with other netems in tree"); 1008 return -EINVAL; 1009 } 1010 1011 /* Parse netlink message to set options */ 1012 static int netem_change(struct Qdisc *sch, struct nlattr *opt, 1013 struct netlink_ext_ack *extack) 1014 { 1015 struct 
netem_sched_data *q = qdisc_priv(sch); 1016 struct nlattr *tb[TCA_NETEM_MAX + 1]; 1017 struct disttable *delay_dist = NULL; 1018 struct disttable *slot_dist = NULL; 1019 struct tc_netem_qopt *qopt; 1020 struct clgstate old_clg; 1021 int old_loss_model = CLG_RANDOM; 1022 int ret; 1023 1024 qopt = nla_data(opt); 1025 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); 1026 if (ret < 0) 1027 return ret; 1028 1029 if (tb[TCA_NETEM_DELAY_DIST]) { 1030 ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]); 1031 if (ret) 1032 goto table_free; 1033 } 1034 1035 if (tb[TCA_NETEM_SLOT_DIST]) { 1036 ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]); 1037 if (ret) 1038 goto table_free; 1039 } 1040 1041 sch_tree_lock(sch); 1042 /* backup q->clg and q->loss_model */ 1043 old_clg = q->clg; 1044 old_loss_model = q->loss_model; 1045 1046 if (tb[TCA_NETEM_LOSS]) { 1047 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); 1048 if (ret) { 1049 q->loss_model = old_loss_model; 1050 q->clg = old_clg; 1051 goto unlock; 1052 } 1053 } else { 1054 q->loss_model = CLG_RANDOM; 1055 } 1056 1057 if (delay_dist) 1058 swap(q->delay_dist, delay_dist); 1059 if (slot_dist) 1060 swap(q->slot_dist, slot_dist); 1061 sch->limit = qopt->limit; 1062 1063 q->latency = PSCHED_TICKS2NS(qopt->latency); 1064 q->jitter = PSCHED_TICKS2NS(qopt->jitter); 1065 q->limit = qopt->limit; 1066 q->gap = qopt->gap; 1067 q->counter = 0; 1068 q->loss = qopt->loss; 1069 1070 ret = check_netem_in_tree(sch, qopt->duplicate, extack); 1071 if (ret) 1072 goto unlock; 1073 1074 q->duplicate = qopt->duplicate; 1075 1076 /* for compatibility with earlier versions. 
1077 * if gap is set, need to assume 100% probability 1078 */ 1079 if (q->gap) 1080 q->reorder = ~0; 1081 1082 if (tb[TCA_NETEM_CORR]) 1083 get_correlation(q, tb[TCA_NETEM_CORR]); 1084 1085 if (tb[TCA_NETEM_REORDER]) 1086 get_reorder(q, tb[TCA_NETEM_REORDER]); 1087 1088 if (tb[TCA_NETEM_CORRUPT]) 1089 get_corrupt(q, tb[TCA_NETEM_CORRUPT]); 1090 1091 if (tb[TCA_NETEM_RATE]) 1092 get_rate(q, tb[TCA_NETEM_RATE]); 1093 1094 if (tb[TCA_NETEM_RATE64]) 1095 q->rate = max_t(u64, q->rate, 1096 nla_get_u64(tb[TCA_NETEM_RATE64])); 1097 1098 if (tb[TCA_NETEM_LATENCY64]) 1099 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); 1100 1101 if (tb[TCA_NETEM_JITTER64]) 1102 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); 1103 1104 if (tb[TCA_NETEM_ECN]) 1105 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); 1106 1107 if (tb[TCA_NETEM_SLOT]) 1108 get_slot(q, tb[TCA_NETEM_SLOT]); 1109 1110 /* capping jitter to the range acceptable by tabledist() */ 1111 q->jitter = min_t(s64, abs(q->jitter), INT_MAX); 1112 1113 if (tb[TCA_NETEM_PRNG_SEED]) 1114 q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); 1115 else 1116 q->prng.seed = get_random_u64(); 1117 prandom_seed_state(&q->prng.prng_state, q->prng.seed); 1118 1119 unlock: 1120 sch_tree_unlock(sch); 1121 1122 table_free: 1123 dist_free(delay_dist); 1124 dist_free(slot_dist); 1125 return ret; 1126 } 1127 1128 static int netem_init(struct Qdisc *sch, struct nlattr *opt, 1129 struct netlink_ext_ack *extack) 1130 { 1131 struct netem_sched_data *q = qdisc_priv(sch); 1132 int ret; 1133 1134 qdisc_watchdog_init(&q->watchdog, sch); 1135 1136 if (!opt) 1137 return -EINVAL; 1138 1139 q->loss_model = CLG_RANDOM; 1140 ret = netem_change(sch, opt, extack); 1141 if (ret) 1142 pr_info("netem: change failed\n"); 1143 return ret; 1144 } 1145 1146 static void netem_destroy(struct Qdisc *sch) 1147 { 1148 struct netem_sched_data *q = qdisc_priv(sch); 1149 1150 qdisc_watchdog_cancel(&q->watchdog); 1151 if (q->qdisc) 1152 qdisc_put(q->qdisc); 1153 
dist_free(q->delay_dist); 1154 dist_free(q->slot_dist); 1155 } 1156 1157 static int dump_loss_model(const struct netem_sched_data *q, 1158 struct sk_buff *skb) 1159 { 1160 struct nlattr *nest; 1161 1162 nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS); 1163 if (nest == NULL) 1164 goto nla_put_failure; 1165 1166 switch (q->loss_model) { 1167 case CLG_RANDOM: 1168 /* legacy loss model */ 1169 nla_nest_cancel(skb, nest); 1170 return 0; /* no data */ 1171 1172 case CLG_4_STATES: { 1173 struct tc_netem_gimodel gi = { 1174 .p13 = q->clg.a1, 1175 .p31 = q->clg.a2, 1176 .p32 = q->clg.a3, 1177 .p14 = q->clg.a4, 1178 .p23 = q->clg.a5, 1179 }; 1180 1181 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) 1182 goto nla_put_failure; 1183 break; 1184 } 1185 case CLG_GILB_ELL: { 1186 struct tc_netem_gemodel ge = { 1187 .p = q->clg.a1, 1188 .r = q->clg.a2, 1189 .h = q->clg.a3, 1190 .k1 = q->clg.a4, 1191 }; 1192 1193 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) 1194 goto nla_put_failure; 1195 break; 1196 } 1197 } 1198 1199 nla_nest_end(skb, nest); 1200 return 0; 1201 1202 nla_put_failure: 1203 nla_nest_cancel(skb, nest); 1204 return -1; 1205 } 1206 1207 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) 1208 { 1209 const struct netem_sched_data *q = qdisc_priv(sch); 1210 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); 1211 struct tc_netem_qopt qopt; 1212 struct tc_netem_corr cor; 1213 struct tc_netem_reorder reorder; 1214 struct tc_netem_corrupt corrupt; 1215 struct tc_netem_rate rate; 1216 struct tc_netem_slot slot; 1217 1218 qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency), 1219 UINT_MAX); 1220 qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter), 1221 UINT_MAX); 1222 qopt.limit = q->limit; 1223 qopt.loss = q->loss; 1224 qopt.gap = q->gap; 1225 qopt.duplicate = q->duplicate; 1226 if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) 1227 goto nla_put_failure; 1228 1229 if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), 
&q->latency)) 1230 goto nla_put_failure; 1231 1232 if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) 1233 goto nla_put_failure; 1234 1235 cor.delay_corr = q->delay_cor.rho; 1236 cor.loss_corr = q->loss_cor.rho; 1237 cor.dup_corr = q->dup_cor.rho; 1238 if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) 1239 goto nla_put_failure; 1240 1241 reorder.probability = q->reorder; 1242 reorder.correlation = q->reorder_cor.rho; 1243 if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) 1244 goto nla_put_failure; 1245 1246 corrupt.probability = q->corrupt; 1247 corrupt.correlation = q->corrupt_cor.rho; 1248 if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) 1249 goto nla_put_failure; 1250 1251 if (q->rate >= (1ULL << 32)) { 1252 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate, 1253 TCA_NETEM_PAD)) 1254 goto nla_put_failure; 1255 rate.rate = ~0U; 1256 } else { 1257 rate.rate = q->rate; 1258 } 1259 rate.packet_overhead = q->packet_overhead; 1260 rate.cell_size = q->cell_size; 1261 rate.cell_overhead = q->cell_overhead; 1262 if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) 1263 goto nla_put_failure; 1264 1265 if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) 1266 goto nla_put_failure; 1267 1268 if (dump_loss_model(q, skb) != 0) 1269 goto nla_put_failure; 1270 1271 if (q->slot_config.min_delay | q->slot_config.max_delay | 1272 q->slot_config.dist_jitter) { 1273 slot = q->slot_config; 1274 if (slot.max_packets == INT_MAX) 1275 slot.max_packets = 0; 1276 if (slot.max_bytes == INT_MAX) 1277 slot.max_bytes = 0; 1278 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) 1279 goto nla_put_failure; 1280 } 1281 1282 if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed, 1283 TCA_NETEM_PAD)) 1284 goto nla_put_failure; 1285 1286 return nla_nest_end(skb, nla); 1287 1288 nla_put_failure: 1289 nlmsg_trim(skb, nla); 1290 return -1; 1291 } 1292 1293 static int netem_dump_class(struct Qdisc *sch, unsigned long cl, 1294 struct 
sk_buff *skb, struct tcmsg *tcm) 1295 { 1296 struct netem_sched_data *q = qdisc_priv(sch); 1297 1298 if (cl != 1 || !q->qdisc) /* only one class */ 1299 return -ENOENT; 1300 1301 tcm->tcm_handle |= TC_H_MIN(1); 1302 tcm->tcm_info = q->qdisc->handle; 1303 1304 return 0; 1305 } 1306 1307 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 1308 struct Qdisc **old, struct netlink_ext_ack *extack) 1309 { 1310 struct netem_sched_data *q = qdisc_priv(sch); 1311 1312 *old = qdisc_replace(sch, new, &q->qdisc); 1313 return 0; 1314 } 1315 1316 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) 1317 { 1318 struct netem_sched_data *q = qdisc_priv(sch); 1319 return q->qdisc; 1320 } 1321 1322 static unsigned long netem_find(struct Qdisc *sch, u32 classid) 1323 { 1324 return 1; 1325 } 1326 1327 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) 1328 { 1329 if (!walker->stop) { 1330 if (!tc_qdisc_stats_dump(sch, 1, walker)) 1331 return; 1332 } 1333 } 1334 1335 static const struct Qdisc_class_ops netem_class_ops = { 1336 .graft = netem_graft, 1337 .leaf = netem_leaf, 1338 .find = netem_find, 1339 .walk = netem_walk, 1340 .dump = netem_dump_class, 1341 }; 1342 1343 static struct Qdisc_ops netem_qdisc_ops __read_mostly = { 1344 .id = "netem", 1345 .cl_ops = &netem_class_ops, 1346 .priv_size = sizeof(struct netem_sched_data), 1347 .enqueue = netem_enqueue, 1348 .dequeue = netem_dequeue, 1349 .peek = qdisc_peek_dequeued, 1350 .init = netem_init, 1351 .reset = netem_reset, 1352 .destroy = netem_destroy, 1353 .change = netem_change, 1354 .dump = netem_dump, 1355 .owner = THIS_MODULE, 1356 }; 1357 MODULE_ALIAS_NET_SCH("netem"); 1358 1359 1360 static int __init netem_module_init(void) 1361 { 1362 pr_info("netem: version " VERSION "\n"); 1363 return register_qdisc(&netem_qdisc_ops); 1364 } 1365 static void __exit netem_module_exit(void) 1366 { 1367 unregister_qdisc(&netem_qdisc_ops); 1368 } 1369 module_init(netem_module_init) 
1370 module_exit(netem_module_exit) 1371 MODULE_LICENSE("GPL"); 1372 MODULE_DESCRIPTION("Network characteristics emulator qdisc"); 1373