/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can be loaded from a table as well to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
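/* Illustrative only (not part of the original source): netem is normally
 * configured from userspace with the iproute2 "tc" tool (see tc-netem(8));
 * exact option syntax may vary between iproute2 versions:
 *
 *   # 100ms delay with 10ms jitter and 25% correlation, plus 0.1% random loss
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.1%
 *
 *   # delay drawn from the pre-built "normal" distribution table
 *   tc qdisc change dev eth0 root netem delay 100ms 20ms distribution normal
 */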
struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32 size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5;	/* p23 used only in 4-states */
	} clg;

};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = prandom_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return prandom_u32();

	value = prandom_u32();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
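/* Worked example (illustrative, not from the original source): get_crandom()
 * computes, in fixed point,
 *
 *	answer = ((2^32 - rho) * value + rho * last) >> 32
 *
 * i.e. roughly (1 - rho/2^32) * value + (rho/2^32) * last.  With rho == 0 the
 * output is plain prandom_u32(); as rho approaches 2^32 each output stays
 * close to the previous one, which is how the configured correlation is
 * realized without floating point.
 */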
/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = prandom_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_GAP_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		}
		break;
	case LOST_IN_BURST_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case GOOD_STATE:
		if (prandom_u32() < clg->a1)
			clg->state = BAD_STATE;
		if (prandom_u32() < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (prandom_u32() < clg->a2)
			clg->state = GOOD_STATE;
		if (prandom_u32() > clg->a3)
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4state loss model algorithm (used also for GI model)
		 * Extracts a value from the markov 4 state loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}
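/* Parameter summary (illustrative, not part of the original comments): the
 * a1..a5 values are probabilities expressed as fractions of the u32 range
 * (userspace tools typically scale percentages to 2^32 - 1).  In the GE model
 * a packet is lost with probability a4/2^32 (1-k) while in GOOD_STATE and
 * with probability (2^32 - a3)/2^32 (1-h) while in BAD_STATE; a1 (p) and
 * a2 (r) drive the GOOD->BAD and BAD->GOOD transitions.  In the 4-state
 * model a1..a5 carry p13, p31, p32, p14 and p23, as noted in struct clgstate.
 */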
/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
{
	u64 ticks;

	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	ticks = (u64)len * NSEC_PER_SEC;

	do_div(ticks, q->rate);
	return PSCHED_NS2TICKS(ticks);
}
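/* Worked example (illustrative, not from the original source): q->rate is in
 * bytes per second, so with rate = 125000 (1 Mbit/s) and no overheads a
 * 1500 byte packet yields 1500 * NSEC_PER_SEC / 125000 = 12 ms of
 * transmission delay.  With a non-zero cell_size the length is first rounded
 * up to a whole number of cells, each costing cell_size + cell_overhead
 * bytes, before the division.
 */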
static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
}

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

	while (*p) {
		struct sk_buff *skb;

		parent = *p;
		skb = rb_to_skb(parent);
		if (tnext >= netem_skb_cb(skb)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&nskb->rbnode, parent, p);
	rb_insert_color(&nskb->rbnode, &q->t_root);
	sch->q.qlen++;
}

/* netem can't properly corrupt a megapacket (like we get from GSO), so when we
 * statistically choose to corrupt one, we instead segment it, returning the
 * first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}

static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
{
	skb->next = qh->head;

	if (!qh->head)
		qh->tail = skb;
	qh->head = skb;
	qh->qlen++;
}
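/* Overview of the enqueue path below (descriptive summary, not from the
 * original source): netem_enqueue() first decides whether to duplicate the
 * packet, then whether to drop it (or ECN-mark it) via loss_event(), then
 * whether to corrupt it (segmenting GSO packets first).  Surviving packets
 * either get a computed time_to_send and go into the tfifo rbtree, or, when
 * reordering triggers, are put at the head of the queue to be sent at once.
 */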
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	struct sk_buff *segs = NULL;
	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
	int nb = 0;
	int count = 1;
	int rc = NET_XMIT_SUCCESS;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (skb_is_gso(skb)) {
			segs = netem_segment(skb, sch, to_free);
			if (!segs)
				return NET_XMIT_DROP;
		} else {
			segs = skb;
		}

		skb = segs;
		segs = segs->next;

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			goto finish_segs;
		}

		skb->data[prandom_u32() % skb_headlen(skb)] ^=
			1<<(prandom_u32() % 8);
	}

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();

		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send) {
					last = t_last;
				}
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(psched_tdiff_t, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		netem_enqueue_skb_head(&sch->q, skb);
		sch->qstats.requeues++;
	}

finish_segs:
	if (segs) {
		while (segs) {
			skb2 = segs->next;
			segs->next = NULL;
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		sch->q.qlen += nb;
		if (nb > 1)
			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	}
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	struct rb_node *p;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);
deliver:
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	p = rb_first(&q->t_root);
	if (p) {
		psched_time_t time_to_send;

		skb = rb_to_skb(p);

		/* if more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (time_to_send <= psched_get_time()) {
			rb_erase(p, &q->t_root);

			sch->q.qlen--;
			qdisc_qstats_backlog_dec(sch, skb);
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (skb->tc_redirected && skb->tc_from_ingress)
				skb->tstamp = 0;
#endif

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS &&
				    net_xmit_drop_count(err)) {
					qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1,
								  pkt_len);
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}
		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}
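/* Timing note (descriptive summary, not from the original source): packets
 * delayed by netem wait in the t_root rbtree keyed by time_to_send.  When the
 * earliest packet is not yet due, netem_dequeue() returns NULL and arms the
 * qdisc watchdog for that packet's time_to_send, so the qdisc is re-polled at
 * the right moment instead of busy-waiting.
 */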
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(q->delay_dist, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
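/* Message layout note (descriptive summary, not from the original source):
 * netem options arrive as a struct tc_netem_qopt header followed by optional
 * netlink attributes (TCA_NETEM_CORR, TCA_NETEM_LOSS, ...).  parse_attr()
 * above skips the NLA_ALIGNed qopt header before validating the attributes
 * against netem_policy, which is why netem_change() reads qopt separately
 * from nla_data(opt).
 */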
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			return ret;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret) {
			/* recover clg and loss_model, in case
			 * q->clg and q->loss_model were modified
			 * in get_loss_clg()
			 */
			q->clg = old_clg;
			q->loss_model = old_loss_model;
			return ret;
		}
	}

	sch->limit = qopt->limit;

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	return ret;
}
static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
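/* Dump format note (descriptive summary, not from the original source):
 * netem_dump() below mirrors the change format: a struct tc_netem_qopt in
 * TCA_OPTIONS followed by the per-feature attributes.  Rates that do not fit
 * in the 32-bit tc_netem_rate.rate field are reported via TCA_NETEM_RATE64,
 * with the legacy field saturated to ~0U.
 */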
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.find		=	netem_find,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");