// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *  Many of the algorithms and ideas for this came from
 *  NIST Net which is not copyrighted.
 *
 *  Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can also be loaded from a table to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

     Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
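
/* Editor's note (illustrative, not derived from this file): the knobs below
 * are normally driven from userspace through iproute2's tc. Assuming an
 * interface named eth0, a typical invocation looks like
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.3% duplicate 1% corrupt 0.1% reorder 25% 50%
 *
 * which maps onto the latency/jitter/correlation, loss, duplicate, corrupt
 * and reorder parameters parsed by netem_change() further down.
 */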

struct disttable {
	u32  size;
	s16 table[];
};

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	s64 latency;
	s64 jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable *delay_dist;

	enum  {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	u64	        time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = get_random_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (!state || state->rho == 0)	/* no correlation */
		return get_random_u32();

	value = get_random_u32();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
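
/* Worked example (editorial; numbers are hypothetical): userspace typically
 * encodes a correlation of 25% as rho ~= 0.25 * UINT32_MAX. With the previous
 * output L and a fresh uniform draw U, the fixed-point blend above computes
 * roughly
 *
 *	answer ~= 0.75 * U + 0.25 * L
 *
 * i.e. an exponentially weighted mix of new entropy and the last value, which
 * is what "25% correlation" means for the delay/loss/duplication decisions.
 */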

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = get_random_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_BURST_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		}
		break;
	case LOST_IN_GAP_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case GOOD_STATE:
		if (get_random_u32() < clg->a1)
			clg->state = BAD_STATE;
		if (get_random_u32() < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (get_random_u32() < clg->a2)
			clg->state = GOOD_STATE;
		if (get_random_u32() > clg->a3)
			return true;
	}

	return false;
}
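
/* Scaling note (editorial; the concrete values are only examples): a1..a5 are
 * raw u32 thresholds, so a probability p is expected as roughly p * UINT32_MAX
 * (e.g. p = 1% ~ 0x028f5c28, r = 30% ~ 0x4ccccccc for the GE model). In
 * GOOD_STATE a packet is lost when a fresh 32-bit draw falls below a4 (the
 * 1-k parameter); in BAD_STATE it is lost when the draw exceeds a3 (the h
 * parameter).
 */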

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model):
		 * a true return from the Markov 4-state loss generator means
		 * the packet is dropped.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * a true return from the Gilbert-Elliot loss generator means
		 * the packet is dropped.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static s64 tabledist(s64 mu, s32 sigma,
		     struct crndstate *state,
		     const struct disttable *dist)
{
	s64 x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
{
	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	return div64_u64(len * NSEC_PER_SEC, q->rate);
}
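
/* Worked example (editorial; ATM-like numbers chosen for illustration only):
 * with packet_overhead = 0, cell_size = 48, cell_overhead = 5 and
 * rate = 1000000 bytes/s, a 100 byte packet needs ceil(100 / 48) = 3 cells,
 * i.e. 3 * (48 + 5) = 159 bytes on the wire, so packet_time_ns() returns
 * 159 * 1e9 / 1e6 = 159000 ns of serialization delay.
 */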

static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}

	rtnl_kfree_skbs(q->t_head, q->t_tail);
	q->t_head = NULL;
	q->t_tail = NULL;
}
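
/* Reading aid (editorial): in tfifo_enqueue() below, an skb whose send time is
 * not earlier than the current tail's is appended to the plain t_head/t_tail
 * list in O(1); only a send time that lands before the tail (e.g. a packet
 * that drew much less jitter than its predecessor) forces an ordered insert
 * into the rbtree, so with little or no jitter the tree stays nearly empty.
 */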

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

		while (*p) {
			struct sk_buff *skb;

			parent = *p;
			skb = rb_to_skb(parent);
			if (tnext >= netem_skb_cb(skb)->time_to_send)
				p = &parent->rb_right;
			else
				p = &parent->rb_left;
		}
		rb_link_node(&nskb->rbnode, parent, p);
		rb_insert_color(&nskb->rbnode, &q->t_root);
	}
	sch->q.qlen++;
}

/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
 * when we statistically choose to corrupt one, we instead segment it, returning
 * the first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	struct sk_buff *segs = NULL;
	unsigned int prev_len = qdisc_pkt_len(skb);
	int count = 1;
	int rc = NET_XMIT_SUCCESS;
	int rc_drop = NET_XMIT_DROP;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		rc_drop = NET_XMIT_SUCCESS;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying it.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (skb_is_gso(skb)) {
			skb = netem_segment(skb, sch, to_free);
			if (!skb)
				return rc_drop;
			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
			1<<get_random_u32_below(8);
	}

	if (unlikely(sch->q.qlen >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		return rc_drop;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = ktime_get_ns();

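		/* Reading aid (editorial; numbers are hypothetical): with a
		 * rate configured, the block below chains transmissions so
		 * serialization delays accumulate. If the most recently
		 * queued packet departs at now + 2ms and this packet drew a
		 * 1ms random delay, that 1ms is already covered by waiting
		 * for the predecessor, so only this packet's own
		 * packet_time_ns() is added on top of the predecessor's
		 * departure time.
		 */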
		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (segs) {
		unsigned int len, last_len;
		int nb;

		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}

/* Set up the next transmission slot: pick a new future start time and
 * reset the byte and packet budgets.
 */

static void get_slot_next(struct netem_sched_data *q, u64 now)
{
	s64 next_delay;

	if (!q->slot_dist)
		next_delay = q->slot_config.min_delay +
				(get_random_u32() *
				 (q->slot_config.max_delay -
				  q->slot_config.min_delay) >> 32);
	else
		next_delay = tabledist(q->slot_config.dist_delay,
				       (s32)(q->slot_config.dist_jitter),
				       NULL, q->slot_dist);

	q->slot.slot_next = now + next_delay;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
}
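
/* Illustrative slot behaviour (editorial; values are hypothetical): with
 * min_delay = 10ms, max_delay = 20ms and no slot_dist table, the expression
 * above picks a uniformly distributed slot start in [now + 10ms, now + 20ms).
 * netem_dequeue() releases nothing until that time, and once the slot opens
 * at most max_packets packets / max_bytes bytes go out before a new slot is
 * drawn.
 */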

static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
	struct sk_buff *skb = skb_rb_first(&q->t_root);
	u64 t1, t2;

	if (!skb)
		return q->t_head;
	if (!q->t_head)
		return skb;

	t1 = netem_skb_cb(skb)->time_to_send;
	t2 = netem_skb_cb(q->t_head)->time_to_send;
	if (t1 < t2)
		return skb;
	return q->t_head;
}

static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
	if (skb == q->t_head) {
		q->t_head = skb->next;
		if (!q->t_head)
			q->t_tail = NULL;
	} else {
		rb_erase(&skb->rbnode, &q->t_root);
	}
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);
deliver:
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* is more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			sch->q.qlen--;
			qdisc_qstats_backlog_dec(sch, skb);
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS &&
				    net_xmit_drop_count(err)) {
					qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1,
								  pkt_len);
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */

static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
{
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	struct disttable *d;
	int i;

	if (!n || n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	*tbl = d;
	return 0;
}
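
/* Background note (editorial): the table loaded here is generated offline.
 * iproute2 ships precomputed tables such as normal.dist and pareto.dist
 * (commonly installed under /usr/lib/tc) and passes one of them as
 * TCA_NETEM_DELAY_DIST or TCA_NETEM_SLOT_DIST when the user asks for e.g.
 * "delay 100ms 20ms distribution normal"; tabledist() then scales the signed
 * 16 bit entries by sigma around mu.
 */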

static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_slot *c = nla_data(attr);

	q->slot_config = *c;
	if (q->slot_config.max_packets == 0)
		q->slot_config.max_packets = INT_MAX;
	if (q->slot_config.max_bytes == 0)
		q->slot_config.max_bytes = INT_MAX;

	/* capping dist_jitter to the range acceptable by tabledist() */
	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));

	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter)
		q->slot.slot_next = ktime_get_ns();
	else
		q->slot.slot_next = 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}
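
/* Mapping note (editorial, following iproute2's documented syntax):
 * "tc ... netem loss state P13 [P31 [P32 [P23 [P14]]]]" arrives here as a
 * NETEM_LOSS_GI attribute, while "loss gemodel P [R [1-H [1-K]]]" arrives as
 * NETEM_LOSS_GE; each percentage is scaled by userspace to a u32 fraction of
 * UINT32_MAX before it reaches this function.
 */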

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse_deprecated(tb, maxtype,
					    nla_data(nla) + NLA_ALIGN(len),
					    nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
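
/* Layout reminder (editorial): the TCA_OPTIONS payload used by netem is not a
 * plain nested attribute; it starts with a fixed struct tc_netem_qopt and only
 * the bytes after NLA_ALIGN(sizeof(qopt)) carry optional attributes, e.g.
 *
 *	| struct tc_netem_qopt | TCA_NETEM_CORR | TCA_NETEM_RATE64 | ...
 *
 * which is why netem_change() reads qopt directly from nla_data(opt) and then
 * calls parse_attr() with sizeof(*qopt) as the offset.
 */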

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct disttable *delay_dist = NULL;
	struct disttable *slot_dist = NULL;
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto table_free;
	}

	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto table_free;
	}

	sch_tree_lock(sch);
	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			q->clg = old_clg;
			goto unlock;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (delay_dist)
		swap(q->delay_dist, delay_dist);
	if (slot_dist)
		swap(q->slot_dist, slot_dist);
	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	/* capping jitter to the range acceptable by tabledist() */
	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);

unlock:
	sch_tree_unlock(sch);

table_free:
	dist_free(delay_dist);
	dist_free(slot_dist);
	return ret;
}

static int netem_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt, extack);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_put(q->qdisc);
	dist_free(q->delay_dist);
	dist_free(q->slot_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;
	struct tc_netem_slot slot;

	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
			     UINT_MAX);
	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
			    UINT_MAX);
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

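	/* Editorial aside: q->rate is kept in bytes per second, so the legacy
	 * 32-bit field overflows at 2^32 B/s (roughly 34 Gbit/s). Beyond that
	 * the real value is carried in TCA_NETEM_RATE64 below and the legacy
	 * field is pinned to ~0U so old userspace sees "very fast" rather
	 * than a truncated number.
	 */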
	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter) {
		slot = q->slot_config;
		if (slot.max_packets == INT_MAX)
			slot.max_packets = 0;
		if (slot.max_bytes == INT_MAX)
			slot.max_bytes = 0;
		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
			goto nla_put_failure;
	}

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (!tc_qdisc_stats_dump(sch, 1, walker))
			return;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.find		=	netem_find,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");