1 /* 2 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) 3 * 4 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 * 11 * Meant to be mostly used for locally generated traffic : 12 * Fast classification depends on skb->sk being set before reaching us. 13 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. 14 * All packets belonging to a socket are considered as a 'flow'. 15 * 16 * Flows are dynamically allocated and stored in a hash table of RB trees 17 * They are also part of one Round Robin 'queues' (new or old flows) 18 * 19 * Burst avoidance (aka pacing) capability : 20 * 21 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a 22 * bunch of packets, and this packet scheduler adds delay between 23 * packets to respect rate limitation. 24 * 25 * enqueue() : 26 * - lookup one RB tree (out of 1024 or more) to find the flow. 27 * If non existent flow, create it, add it to the tree. 28 * Add skb to the per flow list of skb (fifo). 29 * - Use a special fifo for high prio packets 30 * 31 * dequeue() : serves flows in Round Robin 32 * Note : When a flow becomes empty, we do not immediately remove it from 33 * rb trees, for performance reasons (its expected to send additional packets, 34 * or SLAB cache will reuse socket for another flow) 35 */ 36 37 #include <linux/module.h> 38 #include <linux/types.h> 39 #include <linux/kernel.h> 40 #include <linux/jiffies.h> 41 #include <linux/string.h> 42 #include <linux/in.h> 43 #include <linux/errno.h> 44 #include <linux/init.h> 45 #include <linux/skbuff.h> 46 #include <linux/slab.h> 47 #include <linux/rbtree.h> 48 #include <linux/hash.h> 49 #include <linux/prefetch.h> 50 #include <linux/vmalloc.h> 51 #include <net/netlink.h> 52 #include <net/pkt_sched.h> 53 #include <net/sock.h> 54 #include <net/tcp_states.h> 55 #include <net/tcp.h> 56 57 /* 58 * Per flow structure, dynamically allocated 59 */ 60 struct fq_flow { 61 struct sk_buff *head; /* list of skbs for this flow : first skb */ 62 union { 63 struct sk_buff *tail; /* last skb in the list */ 64 unsigned long age; /* jiffies when flow was emptied, for gc */ 65 }; 66 struct rb_node fq_node; /* anchor in fq_root[] trees */ 67 struct sock *sk; 68 int qlen; /* number of packets in flow queue */ 69 int credit; 70 u32 socket_hash; /* sk_hash */ 71 struct fq_flow *next; /* next pointer in RR lists, or &detached */ 72 73 struct rb_node rate_node; /* anchor in q->delayed tree */ 74 u64 time_next_packet; 75 }; 76 77 struct fq_flow_head { 78 struct fq_flow *first; 79 struct fq_flow *last; 80 }; 81 82 struct fq_sched_data { 83 struct fq_flow_head new_flows; 84 85 struct fq_flow_head old_flows; 86 87 struct rb_root delayed; /* for rate limited flows */ 88 u64 time_next_delayed_flow; 89 90 struct fq_flow internal; /* for non classified or high prio packets */ 91 u32 quantum; 92 u32 initial_quantum; 93 u32 flow_refill_delay; 94 u32 flow_max_rate; /* optional max rate per flow */ 95 u32 flow_plimit; /* max packets per flow */ 96 u32 orphan_mask; /* mask for orphaned skb */ 97 struct rb_root *fq_root; 98 u8 rate_enable; 99 u8 fq_trees_log; 100 101 u32 flows; 102 u32 inactive_flows; 103 u32 throttled_flows; 104 105 u64 stat_gc_flows; 106 u64 stat_internal_packets; 107 u64 stat_tcp_retrans; 108 u64 stat_throttled; 109 u64 stat_flows_plimit; 110 u64 stat_pkts_too_long; 111 u64 stat_allocation_errors; 112 struct qdisc_watchdog watchdog; 113 }; 114 115 /* special value to mark a detached flow (not on old/new list) */ 116 static struct fq_flow detached, throttled; 117 118 static void fq_flow_set_detached(struct fq_flow *f) 119 { 120 f->next = &detached; 121 f->age = jiffies; 122 } 123 124 static bool fq_flow_is_detached(const struct fq_flow *f) 125 { 126 return f->next == &detached; 127 } 128 129 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) 130 { 131 struct rb_node **p = &q->delayed.rb_node, *parent = NULL; 132 133 while (*p) { 134 struct fq_flow *aux; 135 136 parent = *p; 137 aux = container_of(parent, struct fq_flow, rate_node); 138 if (f->time_next_packet >= aux->time_next_packet) 139 p = &parent->rb_right; 140 else 141 p = &parent->rb_left; 142 } 143 rb_link_node(&f->rate_node, parent, p); 144 rb_insert_color(&f->rate_node, &q->delayed); 145 q->throttled_flows++; 146 q->stat_throttled++; 147 148 f->next = &throttled; 149 if (q->time_next_delayed_flow > f->time_next_packet) 150 q->time_next_delayed_flow = f->time_next_packet; 151 } 152 153 154 static struct kmem_cache *fq_flow_cachep __read_mostly; 155 156 static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) 157 { 158 if (head->first) 159 head->last->next = flow; 160 else 161 head->first = flow; 162 head->last = flow; 163 flow->next = NULL; 164 } 165 166 /* limit number of collected flows per round */ 167 #define FQ_GC_MAX 8 168 #define FQ_GC_AGE (3*HZ) 169 170 static bool fq_gc_candidate(const struct fq_flow *f) 171 { 172 return fq_flow_is_detached(f) && 173 time_after(jiffies, f->age + FQ_GC_AGE); 174 } 175 176 static void fq_gc(struct fq_sched_data *q, 177 struct rb_root *root, 178 struct sock *sk) 179 { 180 struct fq_flow *f, *tofree[FQ_GC_MAX]; 181 struct rb_node **p, *parent; 182 int fcnt = 0; 183 184 p = &root->rb_node; 185 parent = NULL; 186 while (*p) { 187 parent = *p; 188 189 f = container_of(parent, struct fq_flow, fq_node); 190 if (f->sk == sk) 191 break; 192 193 if (fq_gc_candidate(f)) { 194 tofree[fcnt++] = f; 195 if (fcnt == FQ_GC_MAX) 196 break; 197 } 198 199 if (f->sk > sk) 200 p = &parent->rb_right; 201 else 202 p = &parent->rb_left; 203 } 204 205 q->flows -= fcnt; 206 q->inactive_flows -= fcnt; 207 q->stat_gc_flows += fcnt; 208 while (fcnt) { 209 struct fq_flow *f = tofree[--fcnt]; 210 211 rb_erase(&f->fq_node, root); 212 kmem_cache_free(fq_flow_cachep, f); 213 } 214 } 215 216 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) 217 { 218 struct rb_node **p, *parent; 219 struct sock *sk = skb->sk; 220 struct rb_root *root; 221 struct fq_flow *f; 222 223 /* warning: no starvation prevention... */ 224 if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) 225 return &q->internal; 226 227 /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket 228 * or a listener (SYNCOOKIE mode) 229 * 1) request sockets are not full blown, 230 * they do not contain sk_pacing_rate 231 * 2) They are not part of a 'flow' yet 232 * 3) We do not want to rate limit them (eg SYNFLOOD attack), 233 * especially if the listener set SO_MAX_PACING_RATE 234 * 4) We pretend they are orphaned 235 */ 236 if (!sk || sk_listener(sk)) { 237 unsigned long hash = skb_get_hash(skb) & q->orphan_mask; 238 239 /* By forcing low order bit to 1, we make sure to not 240 * collide with a local flow (socket pointers are word aligned) 241 */ 242 sk = (struct sock *)((hash << 1) | 1UL); 243 skb_orphan(skb); 244 } 245 246 root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; 247 248 if (q->flows >= (2U << q->fq_trees_log) && 249 q->inactive_flows > q->flows/2) 250 fq_gc(q, root, sk); 251 252 p = &root->rb_node; 253 parent = NULL; 254 while (*p) { 255 parent = *p; 256 257 f = container_of(parent, struct fq_flow, fq_node); 258 if (f->sk == sk) { 259 /* socket might have been reallocated, so check 260 * if its sk_hash is the same. 261 * It not, we need to refill credit with 262 * initial quantum 263 */ 264 if (unlikely(skb->sk && 265 f->socket_hash != sk->sk_hash)) { 266 f->credit = q->initial_quantum; 267 f->socket_hash = sk->sk_hash; 268 f->time_next_packet = 0ULL; 269 } 270 return f; 271 } 272 if (f->sk > sk) 273 p = &parent->rb_right; 274 else 275 p = &parent->rb_left; 276 } 277 278 f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); 279 if (unlikely(!f)) { 280 q->stat_allocation_errors++; 281 return &q->internal; 282 } 283 fq_flow_set_detached(f); 284 f->sk = sk; 285 if (skb->sk) 286 f->socket_hash = sk->sk_hash; 287 f->credit = q->initial_quantum; 288 289 rb_link_node(&f->fq_node, parent, p); 290 rb_insert_color(&f->fq_node, root); 291 292 q->flows++; 293 q->inactive_flows++; 294 return f; 295 } 296 297 298 /* remove one skb from head of flow queue */ 299 static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) 300 { 301 struct sk_buff *skb = flow->head; 302 303 if (skb) { 304 flow->head = skb->next; 305 skb->next = NULL; 306 flow->qlen--; 307 qdisc_qstats_backlog_dec(sch, skb); 308 sch->q.qlen--; 309 } 310 return skb; 311 } 312 313 /* We might add in the future detection of retransmits 314 * For the time being, just return false 315 */ 316 static bool skb_is_retransmit(struct sk_buff *skb) 317 { 318 return false; 319 } 320 321 /* add skb to flow queue 322 * flow queue is a linked list, kind of FIFO, except for TCP retransmits 323 * We special case tcp retransmits to be transmitted before other packets. 324 * We rely on fact that TCP retransmits are unlikely, so we do not waste 325 * a separate queue or a pointer. 326 * head-> [retrans pkt 1] 327 * [retrans pkt 2] 328 * [ normal pkt 1] 329 * [ normal pkt 2] 330 * [ normal pkt 3] 331 * tail-> [ normal pkt 4] 332 */ 333 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) 334 { 335 struct sk_buff *prev, *head = flow->head; 336 337 skb->next = NULL; 338 if (!head) { 339 flow->head = skb; 340 flow->tail = skb; 341 return; 342 } 343 if (likely(!skb_is_retransmit(skb))) { 344 flow->tail->next = skb; 345 flow->tail = skb; 346 return; 347 } 348 349 /* This skb is a tcp retransmit, 350 * find the last retrans packet in the queue 351 */ 352 prev = NULL; 353 while (skb_is_retransmit(head)) { 354 prev = head; 355 head = head->next; 356 if (!head) 357 break; 358 } 359 if (!prev) { /* no rtx packet in queue, become the new head */ 360 skb->next = flow->head; 361 flow->head = skb; 362 } else { 363 if (prev == flow->tail) 364 flow->tail = skb; 365 else 366 skb->next = prev->next; 367 prev->next = skb; 368 } 369 } 370 371 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 372 { 373 struct fq_sched_data *q = qdisc_priv(sch); 374 struct fq_flow *f; 375 376 if (unlikely(sch->q.qlen >= sch->limit)) 377 return qdisc_drop(skb, sch); 378 379 f = fq_classify(skb, q); 380 if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { 381 q->stat_flows_plimit++; 382 return qdisc_drop(skb, sch); 383 } 384 385 f->qlen++; 386 if (skb_is_retransmit(skb)) 387 q->stat_tcp_retrans++; 388 qdisc_qstats_backlog_inc(sch, skb); 389 if (fq_flow_is_detached(f)) { 390 fq_flow_add_tail(&q->new_flows, f); 391 if (time_after(jiffies, f->age + q->flow_refill_delay)) 392 f->credit = max_t(u32, f->credit, q->quantum); 393 q->inactive_flows--; 394 } 395 396 /* Note: this overwrites f->age */ 397 flow_queue_add(f, skb); 398 399 if (unlikely(f == &q->internal)) { 400 q->stat_internal_packets++; 401 } 402 sch->q.qlen++; 403 404 return NET_XMIT_SUCCESS; 405 } 406 407 static void fq_check_throttled(struct fq_sched_data *q, u64 now) 408 { 409 struct rb_node *p; 410 411 if (q->time_next_delayed_flow > now) 412 return; 413 414 q->time_next_delayed_flow = ~0ULL; 415 while ((p = rb_first(&q->delayed)) != NULL) { 416 struct fq_flow *f = container_of(p, struct fq_flow, rate_node); 417 418 if (f->time_next_packet > now) { 419 q->time_next_delayed_flow = f->time_next_packet; 420 break; 421 } 422 rb_erase(p, &q->delayed); 423 q->throttled_flows--; 424 fq_flow_add_tail(&q->old_flows, f); 425 } 426 } 427 428 static struct sk_buff *fq_dequeue(struct Qdisc *sch) 429 { 430 struct fq_sched_data *q = qdisc_priv(sch); 431 u64 now = ktime_get_ns(); 432 struct fq_flow_head *head; 433 struct sk_buff *skb; 434 struct fq_flow *f; 435 u32 rate; 436 437 skb = fq_dequeue_head(sch, &q->internal); 438 if (skb) 439 goto out; 440 fq_check_throttled(q, now); 441 begin: 442 head = &q->new_flows; 443 if (!head->first) { 444 head = &q->old_flows; 445 if (!head->first) { 446 if (q->time_next_delayed_flow != ~0ULL) 447 qdisc_watchdog_schedule_ns(&q->watchdog, 448 q->time_next_delayed_flow, 449 false); 450 return NULL; 451 } 452 } 453 f = head->first; 454 455 if (f->credit <= 0) { 456 f->credit += q->quantum; 457 head->first = f->next; 458 fq_flow_add_tail(&q->old_flows, f); 459 goto begin; 460 } 461 462 skb = f->head; 463 if (unlikely(skb && now < f->time_next_packet && 464 !skb_is_tcp_pure_ack(skb))) { 465 head->first = f->next; 466 fq_flow_set_throttled(q, f); 467 goto begin; 468 } 469 470 skb = fq_dequeue_head(sch, f); 471 if (!skb) { 472 head->first = f->next; 473 /* force a pass through old_flows to prevent starvation */ 474 if ((head == &q->new_flows) && q->old_flows.first) { 475 fq_flow_add_tail(&q->old_flows, f); 476 } else { 477 fq_flow_set_detached(f); 478 q->inactive_flows++; 479 } 480 goto begin; 481 } 482 prefetch(&skb->end); 483 f->credit -= qdisc_pkt_len(skb); 484 485 if (f->credit > 0 || !q->rate_enable) 486 goto out; 487 488 /* Do not pace locally generated ack packets */ 489 if (skb_is_tcp_pure_ack(skb)) 490 goto out; 491 492 rate = q->flow_max_rate; 493 if (skb->sk) 494 rate = min(skb->sk->sk_pacing_rate, rate); 495 496 if (rate != ~0U) { 497 u32 plen = max(qdisc_pkt_len(skb), q->quantum); 498 u64 len = (u64)plen * NSEC_PER_SEC; 499 500 if (likely(rate)) 501 do_div(len, rate); 502 /* Since socket rate can change later, 503 * clamp the delay to 1 second. 504 * Really, providers of too big packets should be fixed ! 505 */ 506 if (unlikely(len > NSEC_PER_SEC)) { 507 len = NSEC_PER_SEC; 508 q->stat_pkts_too_long++; 509 } 510 511 f->time_next_packet = now + len; 512 } 513 out: 514 qdisc_bstats_update(sch, skb); 515 return skb; 516 } 517 518 static void fq_reset(struct Qdisc *sch) 519 { 520 struct fq_sched_data *q = qdisc_priv(sch); 521 struct rb_root *root; 522 struct sk_buff *skb; 523 struct rb_node *p; 524 struct fq_flow *f; 525 unsigned int idx; 526 527 while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) 528 kfree_skb(skb); 529 530 if (!q->fq_root) 531 return; 532 533 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { 534 root = &q->fq_root[idx]; 535 while ((p = rb_first(root)) != NULL) { 536 f = container_of(p, struct fq_flow, fq_node); 537 rb_erase(p, root); 538 539 while ((skb = fq_dequeue_head(sch, f)) != NULL) 540 kfree_skb(skb); 541 542 kmem_cache_free(fq_flow_cachep, f); 543 } 544 } 545 q->new_flows.first = NULL; 546 q->old_flows.first = NULL; 547 q->delayed = RB_ROOT; 548 q->flows = 0; 549 q->inactive_flows = 0; 550 q->throttled_flows = 0; 551 } 552 553 static void fq_rehash(struct fq_sched_data *q, 554 struct rb_root *old_array, u32 old_log, 555 struct rb_root *new_array, u32 new_log) 556 { 557 struct rb_node *op, **np, *parent; 558 struct rb_root *oroot, *nroot; 559 struct fq_flow *of, *nf; 560 int fcnt = 0; 561 u32 idx; 562 563 for (idx = 0; idx < (1U << old_log); idx++) { 564 oroot = &old_array[idx]; 565 while ((op = rb_first(oroot)) != NULL) { 566 rb_erase(op, oroot); 567 of = container_of(op, struct fq_flow, fq_node); 568 if (fq_gc_candidate(of)) { 569 fcnt++; 570 kmem_cache_free(fq_flow_cachep, of); 571 continue; 572 } 573 nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; 574 575 np = &nroot->rb_node; 576 parent = NULL; 577 while (*np) { 578 parent = *np; 579 580 nf = container_of(parent, struct fq_flow, fq_node); 581 BUG_ON(nf->sk == of->sk); 582 583 if (nf->sk > of->sk) 584 np = &parent->rb_right; 585 else 586 np = &parent->rb_left; 587 } 588 589 rb_link_node(&of->fq_node, parent, np); 590 rb_insert_color(&of->fq_node, nroot); 591 } 592 } 593 q->flows -= fcnt; 594 q->inactive_flows -= fcnt; 595 q->stat_gc_flows += fcnt; 596 } 597 598 static void *fq_alloc_node(size_t sz, int node) 599 { 600 void *ptr; 601 602 ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); 603 if (!ptr) 604 ptr = vmalloc_node(sz, node); 605 return ptr; 606 } 607 608 static void fq_free(void *addr) 609 { 610 kvfree(addr); 611 } 612 613 static int fq_resize(struct Qdisc *sch, u32 log) 614 { 615 struct fq_sched_data *q = qdisc_priv(sch); 616 struct rb_root *array; 617 void *old_fq_root; 618 u32 idx; 619 620 if (q->fq_root && log == q->fq_trees_log) 621 return 0; 622 623 /* If XPS was setup, we can allocate memory on right NUMA node */ 624 array = fq_alloc_node(sizeof(struct rb_root) << log, 625 netdev_queue_numa_node_read(sch->dev_queue)); 626 if (!array) 627 return -ENOMEM; 628 629 for (idx = 0; idx < (1U << log); idx++) 630 array[idx] = RB_ROOT; 631 632 sch_tree_lock(sch); 633 634 old_fq_root = q->fq_root; 635 if (old_fq_root) 636 fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); 637 638 q->fq_root = array; 639 q->fq_trees_log = log; 640 641 sch_tree_unlock(sch); 642 643 fq_free(old_fq_root); 644 645 return 0; 646 } 647 648 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { 649 [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, 650 [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, 651 [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, 652 [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, 653 [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, 654 [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, 655 [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, 656 [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, 657 [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, 658 }; 659 660 static int fq_change(struct Qdisc *sch, struct nlattr *opt) 661 { 662 struct fq_sched_data *q = qdisc_priv(sch); 663 struct nlattr *tb[TCA_FQ_MAX + 1]; 664 int err, drop_count = 0; 665 unsigned drop_len = 0; 666 u32 fq_log; 667 668 if (!opt) 669 return -EINVAL; 670 671 err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); 672 if (err < 0) 673 return err; 674 675 sch_tree_lock(sch); 676 677 fq_log = q->fq_trees_log; 678 679 if (tb[TCA_FQ_BUCKETS_LOG]) { 680 u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); 681 682 if (nval >= 1 && nval <= ilog2(256*1024)) 683 fq_log = nval; 684 else 685 err = -EINVAL; 686 } 687 if (tb[TCA_FQ_PLIMIT]) 688 sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); 689 690 if (tb[TCA_FQ_FLOW_PLIMIT]) 691 q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); 692 693 if (tb[TCA_FQ_QUANTUM]) { 694 u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); 695 696 if (quantum > 0) 697 q->quantum = quantum; 698 else 699 err = -EINVAL; 700 } 701 702 if (tb[TCA_FQ_INITIAL_QUANTUM]) 703 q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); 704 705 if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) 706 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", 707 nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); 708 709 if (tb[TCA_FQ_FLOW_MAX_RATE]) 710 q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); 711 712 if (tb[TCA_FQ_RATE_ENABLE]) { 713 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); 714 715 if (enable <= 1) 716 q->rate_enable = enable; 717 else 718 err = -EINVAL; 719 } 720 721 if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { 722 u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; 723 724 q->flow_refill_delay = usecs_to_jiffies(usecs_delay); 725 } 726 727 if (tb[TCA_FQ_ORPHAN_MASK]) 728 q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); 729 730 if (!err) { 731 sch_tree_unlock(sch); 732 err = fq_resize(sch, fq_log); 733 sch_tree_lock(sch); 734 } 735 while (sch->q.qlen > sch->limit) { 736 struct sk_buff *skb = fq_dequeue(sch); 737 738 if (!skb) 739 break; 740 drop_len += qdisc_pkt_len(skb); 741 kfree_skb(skb); 742 drop_count++; 743 } 744 qdisc_tree_reduce_backlog(sch, drop_count, drop_len); 745 746 sch_tree_unlock(sch); 747 return err; 748 } 749 750 static void fq_destroy(struct Qdisc *sch) 751 { 752 struct fq_sched_data *q = qdisc_priv(sch); 753 754 fq_reset(sch); 755 fq_free(q->fq_root); 756 qdisc_watchdog_cancel(&q->watchdog); 757 } 758 759 static int fq_init(struct Qdisc *sch, struct nlattr *opt) 760 { 761 struct fq_sched_data *q = qdisc_priv(sch); 762 int err; 763 764 sch->limit = 10000; 765 q->flow_plimit = 100; 766 q->quantum = 2 * psched_mtu(qdisc_dev(sch)); 767 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); 768 q->flow_refill_delay = msecs_to_jiffies(40); 769 q->flow_max_rate = ~0U; 770 q->rate_enable = 1; 771 q->new_flows.first = NULL; 772 q->old_flows.first = NULL; 773 q->delayed = RB_ROOT; 774 q->fq_root = NULL; 775 q->fq_trees_log = ilog2(1024); 776 q->orphan_mask = 1024 - 1; 777 qdisc_watchdog_init(&q->watchdog, sch); 778 779 if (opt) 780 err = fq_change(sch, opt); 781 else 782 err = fq_resize(sch, q->fq_trees_log); 783 784 return err; 785 } 786 787 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) 788 { 789 struct fq_sched_data *q = qdisc_priv(sch); 790 struct nlattr *opts; 791 792 opts = nla_nest_start(skb, TCA_OPTIONS); 793 if (opts == NULL) 794 goto nla_put_failure; 795 796 /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ 797 798 if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || 799 nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || 800 nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || 801 nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || 802 nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || 803 nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || 804 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, 805 jiffies_to_usecs(q->flow_refill_delay)) || 806 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || 807 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) 808 goto nla_put_failure; 809 810 return nla_nest_end(skb, opts); 811 812 nla_put_failure: 813 return -1; 814 } 815 816 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 817 { 818 struct fq_sched_data *q = qdisc_priv(sch); 819 u64 now = ktime_get_ns(); 820 struct tc_fq_qd_stats st = { 821 .gc_flows = q->stat_gc_flows, 822 .highprio_packets = q->stat_internal_packets, 823 .tcp_retrans = q->stat_tcp_retrans, 824 .throttled = q->stat_throttled, 825 .flows_plimit = q->stat_flows_plimit, 826 .pkts_too_long = q->stat_pkts_too_long, 827 .allocation_errors = q->stat_allocation_errors, 828 .flows = q->flows, 829 .inactive_flows = q->inactive_flows, 830 .throttled_flows = q->throttled_flows, 831 .time_next_delayed_flow = q->time_next_delayed_flow - now, 832 }; 833 834 return gnet_stats_copy_app(d, &st, sizeof(st)); 835 } 836 837 static struct Qdisc_ops fq_qdisc_ops __read_mostly = { 838 .id = "fq", 839 .priv_size = sizeof(struct fq_sched_data), 840 841 .enqueue = fq_enqueue, 842 .dequeue = fq_dequeue, 843 .peek = qdisc_peek_dequeued, 844 .init = fq_init, 845 .reset = fq_reset, 846 .destroy = fq_destroy, 847 .change = fq_change, 848 .dump = fq_dump, 849 .dump_stats = fq_dump_stats, 850 .owner = THIS_MODULE, 851 }; 852 853 static int __init fq_module_init(void) 854 { 855 int ret; 856 857 fq_flow_cachep = kmem_cache_create("fq_flow_cache", 858 sizeof(struct fq_flow), 859 0, 0, NULL); 860 if (!fq_flow_cachep) 861 return -ENOMEM; 862 863 ret = register_qdisc(&fq_qdisc_ops); 864 if (ret) 865 kmem_cache_destroy(fq_flow_cachep); 866 return ret; 867 } 868 869 static void __exit fq_module_exit(void) 870 { 871 unregister_qdisc(&fq_qdisc_ops); 872 kmem_cache_destroy(fq_flow_cachep); 873 } 874 875 module_init(fq_module_init) 876 module_exit(fq_module_exit) 877 MODULE_AUTHOR("Eric Dumazet"); 878 MODULE_LICENSE("GPL"); 879