// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic:
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with a 32-bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin 'queues' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability:
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 * - lookup one RB tree (out of 1024 or more) to find the flow.
 *   If the flow does not exist, create it and add it to the tree.
 *   Add skb to the per flow list of skb (fifo).
 * - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it is expected to send additional packets,
 * or the SLAB cache will reuse the socket for another flow)
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

struct fq_skb_cb {
	u64 time_to_send;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in the linear list (head,tail), otherwise they are placed in the rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line, used in fq_dequeue() */
	int		credit;
	/* 32bit hole on 64bit arches */

	struct fq_flow *next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
} ____cacheline_aligned_in_smp;

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	unsigned long	unthrottle_latency_ns;

	struct fq_flow	internal;	/* for non classified or high prio packets */
	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;

	u32		flows;
	u32		inactive_flows;
	u32		throttled_flows;

	u64		stat_gc_flows;
	u64		stat_internal_packets;
	u64		stat_throttled;
	u64		stat_ce_mark;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;

	u32		timer_slack; /* hrtimer slack in ns */
	struct qdisc_watchdog watchdog;
};
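/* Scheduling in a nutshell: fq_dequeue() serves new_flows first, then
 * old_flows, deficit round robin style. A flow that exhausts its credit
 * receives another quantum and moves to the tail of old_flows; a flow
 * whose next packet is not yet due is parked in the 'delayed' rbtree
 * until its time_next_packet.
 */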
/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail's low order bit is 0, since alignof(struct sk_buff) >= 2.
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}

/* special value to mark a throttled flow (not on old/new list) */
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(&q->old_flows, f);
}

static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;


/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}

static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* warning: no starvation prevention... */
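	/* TC_PRIO_CONTROL packets skip flow lookup and pacing entirely:
	 * q->internal is a simple fifo, served before any regular flow
	 * in fq_dequeue().
	 */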
	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
		return &q->internal;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are not connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}

static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}
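/* fq_erase_head() removes @skb from the flow queue, either from the linear
 * (head,tail) list or from t_root. skb->rbnode shares a union with
 * skb->next/prev/dev in struct sk_buff, so skb->dev must be restored when
 * the skb leaves the rbtree.
 */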
static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}

/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	flow->qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch, to_free);
	}

	f->qlen++;
	qdisc_qstats_backlog_inc(sch, skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (time_after(jiffies, f->age + q->flow_refill_delay))
			f->credit = max_t(u32, f->credit, q->quantum);
		q->inactive_flows--;
	}

	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}
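/* Move flows whose earliest departure time has passed from the 'delayed'
 * rbtree back to old_flows, and refresh q->unthrottle_latency_ns, an EWMA
 * of how late throttled flows are released.
 */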
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	now = ktime_get_ns();
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_range_ns(&q->watchdog,
							q->time_next_delayed_flow,
							q->timer_slack);
			return NULL;
		}
	}
	f = head->first;

	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			q->inactive_flows++;
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(skb->sk->sk_pacing_rate, rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
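		/* The lateness (now - f->time_next_packet) is subtracted from
		 * the new delay, but capped at len/2 so a long stall cannot
		 * erase the pacing delay entirely.
		 */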
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->flows = 0;
	q->inactive_flows = 0;
	q->throttled_flows = 0;
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was set up, we can allocate memory on the right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	q->fq_trees_log = log;

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}
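/* Netlink attributes accepted by "tc qdisc ... fq". For reference, an
 * illustrative iproute2 invocation (option names are tc's, not checked here):
 *
 *	tc qdisc replace dev eth0 root fq limit 10000 flow_limit 100 \
 *		quantum 3028 initial_quantum 15140 maxrate 1gbit
 *
 * limit, flow_limit, quantum, initial_quantum and maxrate correspond to
 * TCA_FQ_PLIMIT, TCA_FQ_FLOW_PLIMIT, TCA_FQ_QUANTUM, TCA_FQ_INITIAL_QUANTUM
 * and TCA_FQ_FLOW_MAX_RATE below.
 */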
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
};

static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			q->quantum = quantum;
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		q->low_rate_threshold =
			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
	}

	if (tb[TCA_FQ_ORPHAN_MASK])
		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

	if (tb[TCA_FQ_CE_THRESHOLD])
		q->ce_threshold = (u64)NSEC_PER_USEC *
				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);

	if (tb[TCA_FQ_TIMER_SLACK])
		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);

	if (!err) {
		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit = 10000;
	q->flow_plimit = 100;
	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay = msecs_to_jiffies(40);
	q->flow_max_rate = ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable = 1;
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->fq_root = NULL;
	q->fq_trees_log = ilog2(1024);
	q->orphan_mask = 1024 - 1;
	q->low_rate_threshold = 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;

	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 ce_threshold = q->ce_threshold;
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	do_div(ce_threshold, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			q->low_rate_threshold) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD,
			(u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;

	sch_tree_lock(sch);

	st.gc_flows = q->stat_gc_flows;
	st.highprio_packets = q->stat_internal_packets;
	st.tcp_retrans = 0;
	st.throttled = q->stat_throttled;
	st.flows_plimit = q->stat_flows_plimit;
	st.pkts_too_long = q->stat_pkts_too_long;
	st.allocation_errors = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows = q->flows;
	st.inactive_flows = q->inactive_flows;
	st.throttled_flows = q->throttled_flows;
	st.unthrottle_latency_ns = min_t(unsigned long,
					 q->unthrottle_latency_ns, ~0U);
	st.ce_mark = q->stat_ce_mark;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		= "fq",
	.priv_size	= sizeof(struct fq_sched_data),

	.enqueue	= fq_enqueue,
	.dequeue	= fq_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_init,
	.reset		= fq_reset,
	.destroy	= fq_destroy,
	.change		= fq_change,
	.dump		= fq_dump,
	.dump_stats	= fq_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");