/*
 * net/sched/sch_fq.c  Fair Queue Packet Scheduler (per flow pacing)
 *
 *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *  Meant to be mostly used for locally generated traffic :
 *  Fast classification depends on skb->sk being set before reaching us.
 *  If not (router workload), we use rxhash as a fallback, with a 32 bit wide hash.
 *  All packets belonging to a socket are considered as a 'flow'.
 *
 *  Flows are dynamically allocated and stored in a hash table of RB trees.
 *  They are also linked into one of two Round Robin lists ('new' or 'old' flows).
 *
 *  Burst avoidance (aka pacing) capability :
 *
 *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 *  bunch of packets, and this packet scheduler adds delay between
 *  packets to respect the rate limitation.
 *
 *  enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If the flow does not exist, create it and add it to the tree.
 *     Add the skb to the per flow list of skbs (fifo).
 *   - Use a special fifo for high prio packets.
 *
 *  dequeue() : serves flows in Round Robin.
 *  Note : When a flow becomes empty, we do not immediately remove it from
 *  the rb trees, for performance reasons (it's expected to send additional
 *  packets, or the SLAB cache will reuse the socket for another flow).
 */

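/* Example usage (a sketch; parameter names follow the iproute2 tc-fq(8)
 * front-end, which maps them to the TCA_FQ_* netlink attributes used below):
 *
 *	tc qdisc replace dev eth0 root fq
 *	tc qdisc replace dev eth0 root fq limit 10000 flow_limit 100 \
 *		quantum 3028 initial_quantum 15140 maxrate 1gbit
 *
 * Pacing itself is driven by sk->sk_pacing_rate, normally set by TCP.
 */
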
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

/*
 * Per flow structure, dynamically allocated
 */
struct fq_flow {
        struct sk_buff  *head;          /* list of skbs for this flow : first skb */
        union {
                struct sk_buff *tail;   /* last skb in the list */
                unsigned long  age;     /* jiffies when flow was emptied, for gc */
        };
        struct rb_node  fq_node;        /* anchor in fq_root[] trees */
        struct sock     *sk;
        int             qlen;           /* number of packets in flow queue */
        int             credit;
        u32             socket_hash;    /* sk_hash */
        struct fq_flow  *next;          /* next pointer in RR lists, or &detached */

        struct rb_node  rate_node;      /* anchor in q->delayed tree */
        u64             time_next_packet;
};

struct fq_flow_head {
        struct fq_flow *first;
        struct fq_flow *last;
};

struct fq_sched_data {
        struct fq_flow_head new_flows;

        struct fq_flow_head old_flows;

        struct rb_root  delayed;        /* for rate limited flows */
        u64             time_next_delayed_flow;

        struct fq_flow  internal;       /* for non classified or high prio packets */
        u32             quantum;
        u32             initial_quantum;
        u32             flow_refill_delay;
        u32             flow_max_rate;  /* optional max rate per flow */
        u32             flow_plimit;    /* max packets per flow */
        u32             orphan_mask;    /* mask for orphaned skb */
        struct rb_root  *fq_root;
        u8              rate_enable;
        u8              fq_trees_log;

        u32             flows;
        u32             inactive_flows;
        u32             throttled_flows;

        u64             stat_gc_flows;
        u64             stat_internal_packets;
        u64             stat_tcp_retrans;
        u64             stat_throttled;
        u64             stat_flows_plimit;
        u64             stat_pkts_too_long;
        u64             stat_allocation_errors;
        struct qdisc_watchdog watchdog;
};

/* special value to mark a detached flow (not on old/new list) */
static struct fq_flow detached, throttled;

static void fq_flow_set_detached(struct fq_flow *f)
{
        f->next = &detached;
        f->age = jiffies;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
        return f->next == &detached;
}

static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
        struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

        while (*p) {
                struct fq_flow *aux;

                parent = *p;
                aux = container_of(parent, struct fq_flow, rate_node);
                if (f->time_next_packet >= aux->time_next_packet)
                        p = &parent->rb_right;
                else
                        p = &parent->rb_left;
        }
        rb_link_node(&f->rate_node, parent, p);
        rb_insert_color(&f->rate_node, &q->delayed);
        q->throttled_flows++;
        q->stat_throttled++;

        f->next = &throttled;
        if (q->time_next_delayed_flow > f->time_next_packet)
                q->time_next_delayed_flow = f->time_next_packet;
}

static struct kmem_cache *fq_flow_cachep __read_mostly;

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
        if (head->first)
                head->last->next = flow;
        else
                head->first = flow;
        head->last = flow;
        flow->next = NULL;
}

/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
        return fq_flow_is_detached(f) &&
               time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
                  struct rb_root *root,
                  struct sock *sk)
{
        struct fq_flow *f, *tofree[FQ_GC_MAX];
        struct rb_node **p, *parent;
        int fcnt = 0;

        p = &root->rb_node;
        parent = NULL;
        while (*p) {
                parent = *p;

                f = container_of(parent, struct fq_flow, fq_node);
                if (f->sk == sk)
                        break;

                if (fq_gc_candidate(f)) {
                        tofree[fcnt++] = f;
                        if (fcnt == FQ_GC_MAX)
                                break;
                }

                if (f->sk > sk)
                        p = &parent->rb_right;
                else
                        p = &parent->rb_left;
        }

        q->flows -= fcnt;
        q->inactive_flows -= fcnt;
        q->stat_gc_flows += fcnt;
        while (fcnt) {
                struct fq_flow *f = tofree[--fcnt];

                rb_erase(&f->fq_node, root);
                kmem_cache_free(fq_flow_cachep, f);
        }
}

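/* Flow lookup: locally generated packets carry skb->sk, and the socket
 * pointer itself is the flow key : hash_32() on the pointer selects one of
 * the 2^fq_trees_log RB trees, each tree being ordered by raw pointer value.
 * Packets without a full socket (forwarded traffic, listeners) fall back to
 * skb_get_hash(), masked with orphan_mask and with the low order bit forced
 * to 1, so the synthetic key never collides with a real (word aligned)
 * socket pointer.
 */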
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
        struct rb_node **p, *parent;
        struct sock *sk = skb->sk;
        struct rb_root *root;
        struct fq_flow *f;

        /* warning: no starvation prevention... */
        if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
                return &q->internal;

        /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
         * or a listener (SYNCOOKIE mode)
         * 1) request sockets are not full blown,
         *    they do not contain sk_pacing_rate
         * 2) They are not part of a 'flow' yet
         * 3) We do not want to rate limit them (eg SYNFLOOD attack),
         *    especially if the listener set SO_MAX_PACING_RATE
         * 4) We pretend they are orphaned
         */
        if (!sk || sk_listener(sk)) {
                unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

                /* By forcing low order bit to 1, we make sure to not
                 * collide with a local flow (socket pointers are word aligned)
                 */
                sk = (struct sock *)((hash << 1) | 1UL);
                skb_orphan(skb);
        }

        root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];

        if (q->flows >= (2U << q->fq_trees_log) &&
            q->inactive_flows > q->flows/2)
                fq_gc(q, root, sk);

        p = &root->rb_node;
        parent = NULL;
        while (*p) {
                parent = *p;

                f = container_of(parent, struct fq_flow, fq_node);
                if (f->sk == sk) {
                        /* socket might have been reallocated, so check
                         * if its sk_hash is the same.
                         * If not, we need to refill credit with
                         * initial quantum
                         */
                        if (unlikely(skb->sk &&
                                     f->socket_hash != sk->sk_hash)) {
                                f->credit = q->initial_quantum;
                                f->socket_hash = sk->sk_hash;
                                f->time_next_packet = 0ULL;
                        }
                        return f;
                }
                if (f->sk > sk)
                        p = &parent->rb_right;
                else
                        p = &parent->rb_left;
        }

        f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!f)) {
                q->stat_allocation_errors++;
                return &q->internal;
        }
        fq_flow_set_detached(f);
        f->sk = sk;
        if (skb->sk)
                f->socket_hash = sk->sk_hash;
        f->credit = q->initial_quantum;

        rb_link_node(&f->fq_node, parent, p);
        rb_insert_color(&f->fq_node, root);

        q->flows++;
        q->inactive_flows++;
        return f;
}

/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
        struct sk_buff *skb = flow->head;

        if (skb) {
                flow->head = skb->next;
                skb->next = NULL;
                flow->qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
                sch->q.qlen--;
        }
        return skb;
}

/* We might add detection of retransmits in the future.
 * For the time being, just return false.
 */
static bool skb_is_retransmit(struct sk_buff *skb)
{
        return false;
}

/* add skb to flow queue
 * flow queue is a linked list, kind of FIFO, except for TCP retransmits
 * We special case tcp retransmits to be transmitted before other packets.
 * We rely on the fact that TCP retransmits are unlikely, so we do not waste
 * a separate queue or a pointer.
 * head->  [retrans pkt 1]
 *         [retrans pkt 2]
 *         [ normal pkt 1]
 *         [ normal pkt 2]
 *         [ normal pkt 3]
 * tail->  [ normal pkt 4]
 */
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
        struct sk_buff *prev, *head = flow->head;

        skb->next = NULL;
        if (!head) {
                flow->head = skb;
                flow->tail = skb;
                return;
        }
        if (likely(!skb_is_retransmit(skb))) {
                flow->tail->next = skb;
                flow->tail = skb;
                return;
        }

        /* This skb is a tcp retransmit,
         * find the last retrans packet in the queue
         */
        prev = NULL;
        while (skb_is_retransmit(head)) {
                prev = head;
                head = head->next;
                if (!head)
                        break;
        }
        if (!prev) {    /* no rtx packet in queue, become the new head */
                skb->next = flow->head;
                flow->head = skb;
        } else {
                if (prev == flow->tail)
                        flow->tail = skb;
                else
                        skb->next = prev->next;
                prev->next = skb;
        }
}

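/* Enqueue path: check the global qdisc limit, classify the skb to its flow,
 * enforce the per-flow packet limit, then append the skb to the flow's list.
 * A flow that was sitting detached is linked back onto the 'new_flows' round
 * robin list; if it stayed idle longer than flow_refill_delay, its credit is
 * topped up to at least one quantum so it does not start a new burst in debt.
 */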
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                      struct sk_buff **to_free)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        struct fq_flow *f;

        if (unlikely(sch->q.qlen >= sch->limit))
                return qdisc_drop(skb, sch, to_free);

        f = fq_classify(skb, q);
        if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
                q->stat_flows_plimit++;
                return qdisc_drop(skb, sch, to_free);
        }

        f->qlen++;
        if (skb_is_retransmit(skb))
                q->stat_tcp_retrans++;
        qdisc_qstats_backlog_inc(sch, skb);
        if (fq_flow_is_detached(f)) {
                fq_flow_add_tail(&q->new_flows, f);
                if (time_after(jiffies, f->age + q->flow_refill_delay))
                        f->credit = max_t(u32, f->credit, q->quantum);
                q->inactive_flows--;
        }

        /* Note: this overwrites f->age */
        flow_queue_add(f, skb);

        if (unlikely(f == &q->internal)) {
                q->stat_internal_packets++;
        }
        sch->q.qlen++;

        return NET_XMIT_SUCCESS;
}

static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
        struct rb_node *p;

        if (q->time_next_delayed_flow > now)
                return;

        q->time_next_delayed_flow = ~0ULL;
        while ((p = rb_first(&q->delayed)) != NULL) {
                struct fq_flow *f = container_of(p, struct fq_flow, rate_node);

                if (f->time_next_packet > now) {
                        q->time_next_delayed_flow = f->time_next_packet;
                        break;
                }
                rb_erase(p, &q->delayed);
                q->throttled_flows--;
                fq_flow_add_tail(&q->old_flows, f);
        }
}

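/* Dequeue path: the internal (high prio) queue is served first, then flows
 * from 'new_flows' and 'old_flows' in deficit round robin fashion, each flow
 * receiving 'quantum' bytes of credit per round.  A flow whose next allowed
 * transmit time lies in the future is parked in the 'delayed' RB tree, and
 * the qdisc watchdog is armed for the earliest such deadline.
 *
 * When pacing is enabled, the gap enforced after a packet is roughly
 *
 *	delay_ns = max(pkt_len, quantum) * NSEC_PER_SEC / rate
 *
 * e.g. with the default Ethernet quantum of about 3028 bytes and a pacing
 * rate of 12500000 bytes/sec (~100 Mbit/s) this is about 242 usec, and it is
 * clamped to at most one second.
 */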
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        u64 now = ktime_get_ns();
        struct fq_flow_head *head;
        struct sk_buff *skb;
        struct fq_flow *f;
        u32 rate;

        skb = fq_dequeue_head(sch, &q->internal);
        if (skb)
                goto out;
        fq_check_throttled(q, now);
begin:
        head = &q->new_flows;
        if (!head->first) {
                head = &q->old_flows;
                if (!head->first) {
                        if (q->time_next_delayed_flow != ~0ULL)
                                qdisc_watchdog_schedule_ns(&q->watchdog,
                                                           q->time_next_delayed_flow);
                        return NULL;
                }
        }
        f = head->first;

        if (f->credit <= 0) {
                f->credit += q->quantum;
                head->first = f->next;
                fq_flow_add_tail(&q->old_flows, f);
                goto begin;
        }

        skb = f->head;
        if (unlikely(skb && now < f->time_next_packet &&
                     !skb_is_tcp_pure_ack(skb))) {
                head->first = f->next;
                fq_flow_set_throttled(q, f);
                goto begin;
        }

        skb = fq_dequeue_head(sch, f);
        if (!skb) {
                head->first = f->next;
                /* force a pass through old_flows to prevent starvation */
                if ((head == &q->new_flows) && q->old_flows.first) {
                        fq_flow_add_tail(&q->old_flows, f);
                } else {
                        fq_flow_set_detached(f);
                        q->inactive_flows++;
                }
                goto begin;
        }
        prefetch(&skb->end);
        f->credit -= qdisc_pkt_len(skb);

        if (f->credit > 0 || !q->rate_enable)
                goto out;

        /* Do not pace locally generated ack packets */
        if (skb_is_tcp_pure_ack(skb))
                goto out;

        rate = q->flow_max_rate;
        if (skb->sk)
                rate = min(skb->sk->sk_pacing_rate, rate);

        if (rate != ~0U) {
                u32 plen = max(qdisc_pkt_len(skb), q->quantum);
                u64 len = (u64)plen * NSEC_PER_SEC;

                if (likely(rate))
                        do_div(len, rate);
                /* Since socket rate can change later,
                 * clamp the delay to 1 second.
                 * Really, providers of too big packets should be fixed!
                 */
                if (unlikely(len > NSEC_PER_SEC)) {
                        len = NSEC_PER_SEC;
                        q->stat_pkts_too_long++;
                }

                f->time_next_packet = now + len;
        }
out:
        qdisc_bstats_update(sch, skb);
        return skb;
}

static void fq_flow_purge(struct fq_flow *flow)
{
        rtnl_kfree_skbs(flow->head, flow->tail);
        flow->head = NULL;
        flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        struct rb_root *root;
        struct rb_node *p;
        struct fq_flow *f;
        unsigned int idx;

        sch->q.qlen = 0;
        sch->qstats.backlog = 0;

        fq_flow_purge(&q->internal);

        if (!q->fq_root)
                return;

        for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
                root = &q->fq_root[idx];
                while ((p = rb_first(root)) != NULL) {
                        f = container_of(p, struct fq_flow, fq_node);
                        rb_erase(p, root);

                        fq_flow_purge(f);

                        kmem_cache_free(fq_flow_cachep, f);
                }
        }
        q->new_flows.first = NULL;
        q->old_flows.first = NULL;
        q->delayed = RB_ROOT;
        q->flows = 0;
        q->inactive_flows = 0;
        q->throttled_flows = 0;
}

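/* Hash table resize helper: move every flow from the old array into a new
 * array of 2^new_log slots.  Flows that already qualify for garbage
 * collection are freed instead of being migrated.
 */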
static void fq_rehash(struct fq_sched_data *q,
                      struct rb_root *old_array, u32 old_log,
                      struct rb_root *new_array, u32 new_log)
{
        struct rb_node *op, **np, *parent;
        struct rb_root *oroot, *nroot;
        struct fq_flow *of, *nf;
        int fcnt = 0;
        u32 idx;

        for (idx = 0; idx < (1U << old_log); idx++) {
                oroot = &old_array[idx];
                while ((op = rb_first(oroot)) != NULL) {
                        rb_erase(op, oroot);
                        of = container_of(op, struct fq_flow, fq_node);
                        if (fq_gc_candidate(of)) {
                                fcnt++;
                                kmem_cache_free(fq_flow_cachep, of);
                                continue;
                        }
                        nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];

                        np = &nroot->rb_node;
                        parent = NULL;
                        while (*np) {
                                parent = *np;

                                nf = container_of(parent, struct fq_flow, fq_node);
                                BUG_ON(nf->sk == of->sk);

                                if (nf->sk > of->sk)
                                        np = &parent->rb_right;
                                else
                                        np = &parent->rb_left;
                        }

                        rb_link_node(&of->fq_node, parent, np);
                        rb_insert_color(&of->fq_node, nroot);
                }
        }
        q->flows -= fcnt;
        q->inactive_flows -= fcnt;
        q->stat_gc_flows += fcnt;
}

static void *fq_alloc_node(size_t sz, int node)
{
        void *ptr;

        ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
        if (!ptr)
                ptr = vmalloc_node(sz, node);
        return ptr;
}

static void fq_free(void *addr)
{
        kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        struct rb_root *array;
        void *old_fq_root;
        u32 idx;

        if (q->fq_root && log == q->fq_trees_log)
                return 0;

        /* If XPS was set up, we can allocate memory on the right NUMA node */
        array = fq_alloc_node(sizeof(struct rb_root) << log,
                              netdev_queue_numa_node_read(sch->dev_queue));
        if (!array)
                return -ENOMEM;

        for (idx = 0; idx < (1U << log); idx++)
                array[idx] = RB_ROOT;

        sch_tree_lock(sch);

        old_fq_root = q->fq_root;
        if (old_fq_root)
                fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

        q->fq_root = array;
        q->fq_trees_log = log;

        sch_tree_unlock(sch);

        fq_free(old_fq_root);

        return 0;
}

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
        [TCA_FQ_PLIMIT]                 = { .type = NLA_U32 },
        [TCA_FQ_FLOW_PLIMIT]            = { .type = NLA_U32 },
        [TCA_FQ_QUANTUM]                = { .type = NLA_U32 },
        [TCA_FQ_INITIAL_QUANTUM]        = { .type = NLA_U32 },
        [TCA_FQ_RATE_ENABLE]            = { .type = NLA_U32 },
        [TCA_FQ_FLOW_DEFAULT_RATE]      = { .type = NLA_U32 },
        [TCA_FQ_FLOW_MAX_RATE]          = { .type = NLA_U32 },
        [TCA_FQ_BUCKETS_LOG]            = { .type = NLA_U32 },
        [TCA_FQ_FLOW_REFILL_DELAY]      = { .type = NLA_U32 },
        [TCA_FQ_ORPHAN_MASK]            = { .type = NLA_U32 },
};

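/* Change path: attributes are applied under the qdisc tree lock, except that
 * the lock is released around fq_resize(), whose hash array allocation uses
 * GFP_KERNEL and may sleep.  Afterwards, if the (possibly lowered) limit is
 * exceeded, packets are dequeued and freed and the backlog reduction is
 * reported to the parent qdiscs.
 */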
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_FQ_MAX + 1];
        int err, drop_count = 0;
        unsigned int drop_len = 0;
        u32 fq_log;

        if (!opt)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
        if (err < 0)
                return err;

        sch_tree_lock(sch);

        fq_log = q->fq_trees_log;

        if (tb[TCA_FQ_BUCKETS_LOG]) {
                u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

                if (nval >= 1 && nval <= ilog2(256*1024))
                        fq_log = nval;
                else
                        err = -EINVAL;
        }
        if (tb[TCA_FQ_PLIMIT])
                sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

        if (tb[TCA_FQ_FLOW_PLIMIT])
                q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

        if (tb[TCA_FQ_QUANTUM]) {
                u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

                if (quantum > 0)
                        q->quantum = quantum;
                else
                        err = -EINVAL;
        }

        if (tb[TCA_FQ_INITIAL_QUANTUM])
                q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

        if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
                pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
                                    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

        if (tb[TCA_FQ_FLOW_MAX_RATE])
                q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

        if (tb[TCA_FQ_RATE_ENABLE]) {
                u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

                if (enable <= 1)
                        q->rate_enable = enable;
                else
                        err = -EINVAL;
        }

        if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
                u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

                q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
        }

        if (tb[TCA_FQ_ORPHAN_MASK])
                q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

        if (!err) {
                sch_tree_unlock(sch);
                err = fq_resize(sch, fq_log);
                sch_tree_lock(sch);
        }
        while (sch->q.qlen > sch->limit) {
                struct sk_buff *skb = fq_dequeue(sch);

                if (!skb)
                        break;
                drop_len += qdisc_pkt_len(skb);
                rtnl_kfree_skbs(skb, skb);
                drop_count++;
        }
        qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

        sch_tree_unlock(sch);
        return err;
}

static void fq_destroy(struct Qdisc *sch)
{
        struct fq_sched_data *q = qdisc_priv(sch);

        fq_reset(sch);
        fq_free(q->fq_root);
        qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        int err;

        sch->limit              = 10000;
        q->flow_plimit          = 100;
        q->quantum              = 2 * psched_mtu(qdisc_dev(sch));
        q->initial_quantum      = 10 * psched_mtu(qdisc_dev(sch));
        q->flow_refill_delay    = msecs_to_jiffies(40);
        q->flow_max_rate        = ~0U;
        q->rate_enable          = 1;
        q->new_flows.first      = NULL;
        q->old_flows.first      = NULL;
        q->delayed              = RB_ROOT;
        q->fq_root              = NULL;
        q->fq_trees_log         = ilog2(1024);
        q->orphan_mask          = 1024 - 1;
        qdisc_watchdog_init(&q->watchdog, sch);

        if (opt)
                err = fq_change(sch, opt);
        else
                err = fq_resize(sch, q->fq_trees_log);

        return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts;

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

        if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
            nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
            nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
            nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
            nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
            nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
            nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
                        jiffies_to_usecs(q->flow_refill_delay)) ||
            nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
            nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
                goto nla_put_failure;

        return nla_nest_end(skb, opts);

nla_put_failure:
        return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct fq_sched_data *q = qdisc_priv(sch);
        u64 now = ktime_get_ns();
        struct tc_fq_qd_stats st = {
                .gc_flows               = q->stat_gc_flows,
                .highprio_packets       = q->stat_internal_packets,
                .tcp_retrans            = q->stat_tcp_retrans,
                .throttled              = q->stat_throttled,
                .flows_plimit           = q->stat_flows_plimit,
                .pkts_too_long          = q->stat_pkts_too_long,
                .allocation_errors      = q->stat_allocation_errors,
                .flows                  = q->flows,
                .inactive_flows         = q->inactive_flows,
                .throttled_flows        = q->throttled_flows,
                .time_next_delayed_flow = q->time_next_delayed_flow - now,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
        .id             = "fq",
        .priv_size      = sizeof(struct fq_sched_data),

        .enqueue        = fq_enqueue,
        .dequeue        = fq_dequeue,
        .peek           = qdisc_peek_dequeued,
        .init           = fq_init,
        .reset          = fq_reset,
        .destroy        = fq_destroy,
        .change         = fq_change,
        .dump           = fq_dump,
        .dump_stats     = fq_dump_stats,
        .owner          = THIS_MODULE,
};

static int __init fq_module_init(void)
{
        int ret;

        fq_flow_cachep = kmem_cache_create("fq_flow_cache",
                                           sizeof(struct fq_flow),
                                           0, 0, NULL);
        if (!fq_flow_cachep)
                return -ENOMEM;

        ret = register_qdisc(&fq_qdisc_ops);
        if (ret)
                kmem_cache_destroy(fq_flow_cachep);
        return ret;
}

static void __exit fq_module_exit(void)
{
        unregister_qdisc(&fq_qdisc_ops);
        kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");