1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) 4 * 5 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> 6 * 7 * Meant to be mostly used for locally generated traffic : 8 * Fast classification depends on skb->sk being set before reaching us. 9 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. 10 * All packets belonging to a socket are considered as a 'flow'. 11 * 12 * Flows are dynamically allocated and stored in a hash table of RB trees 13 * They are also part of one Round Robin 'queues' (new or old flows) 14 * 15 * Burst avoidance (aka pacing) capability : 16 * 17 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a 18 * bunch of packets, and this packet scheduler adds delay between 19 * packets to respect rate limitation. 20 * 21 * enqueue() : 22 * - lookup one RB tree (out of 1024 or more) to find the flow. 23 * If non existent flow, create it, add it to the tree. 24 * Add skb to the per flow list of skb (fifo). 25 * - Use a special fifo for high prio packets 26 * 27 * dequeue() : serves flows in Round Robin 28 * Note : When a flow becomes empty, we do not immediately remove it from 29 * rb trees, for performance reasons (its expected to send additional packets, 30 * or SLAB cache will reuse socket for another flow) 31 */ 32 33 #include <linux/module.h> 34 #include <linux/types.h> 35 #include <linux/kernel.h> 36 #include <linux/jiffies.h> 37 #include <linux/string.h> 38 #include <linux/in.h> 39 #include <linux/errno.h> 40 #include <linux/init.h> 41 #include <linux/skbuff.h> 42 #include <linux/slab.h> 43 #include <linux/rbtree.h> 44 #include <linux/hash.h> 45 #include <linux/prefetch.h> 46 #include <linux/vmalloc.h> 47 #include <net/netlink.h> 48 #include <net/pkt_sched.h> 49 #include <net/sock.h> 50 #include <net/tcp_states.h> 51 #include <net/tcp.h> 52 53 struct fq_skb_cb { 54 u64 time_to_send; 55 }; 56 57 static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) 58 { 59 qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb)); 60 return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; 61 } 62 63 /* 64 * Per flow structure, dynamically allocated. 65 * If packets have monotically increasing time_to_send, they are placed in O(1) 66 * in linear list (head,tail), otherwise are placed in a rbtree (t_root). 67 */ 68 struct fq_flow { 69 /* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ 70 struct rb_root t_root; 71 struct sk_buff *head; /* list of skbs for this flow : first skb */ 72 union { 73 struct sk_buff *tail; /* last skb in the list */ 74 unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ 75 }; 76 struct rb_node fq_node; /* anchor in fq_root[] trees */ 77 struct sock *sk; 78 u32 socket_hash; /* sk_hash */ 79 int qlen; /* number of packets in flow queue */ 80 81 /* Second cache line, used in fq_dequeue() */ 82 int credit; 83 /* 32bit hole on 64bit arches */ 84 85 struct fq_flow *next; /* next pointer in RR lists */ 86 87 struct rb_node rate_node; /* anchor in q->delayed tree */ 88 u64 time_next_packet; 89 } ____cacheline_aligned_in_smp; 90 91 struct fq_flow_head { 92 struct fq_flow *first; 93 struct fq_flow *last; 94 }; 95 96 struct fq_sched_data { 97 struct fq_flow_head new_flows; 98 99 struct fq_flow_head old_flows; 100 101 struct rb_root delayed; /* for rate limited flows */ 102 u64 time_next_delayed_flow; 103 u64 ktime_cache; /* copy of last ktime_get_ns() */ 104 unsigned long unthrottle_latency_ns; 105 106 struct fq_flow internal; /* for non classified or high prio packets */ 107 u32 quantum; 108 u32 initial_quantum; 109 u32 flow_refill_delay; 110 u32 flow_plimit; /* max packets per flow */ 111 unsigned long flow_max_rate; /* optional max rate per flow */ 112 u64 ce_threshold; 113 u64 horizon; /* horizon in ns */ 114 u32 orphan_mask; /* mask for orphaned skb */ 115 u32 low_rate_threshold; 116 struct rb_root *fq_root; 117 u8 rate_enable; 118 u8 fq_trees_log; 119 u8 horizon_drop; 120 u32 flows; 121 u32 inactive_flows; 122 u32 throttled_flows; 123 124 u64 stat_gc_flows; 125 u64 stat_internal_packets; 126 u64 stat_throttled; 127 u64 stat_ce_mark; 128 u64 stat_horizon_drops; 129 u64 stat_horizon_caps; 130 u64 stat_flows_plimit; 131 u64 stat_pkts_too_long; 132 u64 stat_allocation_errors; 133 134 u32 timer_slack; /* hrtimer slack in ns */ 135 struct qdisc_watchdog watchdog; 136 }; 137 138 /* 139 * f->tail and f->age share the same location. 140 * We can use the low order bit to differentiate if this location points 141 * to a sk_buff or contains a jiffies value, if we force this value to be odd. 142 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2 143 */ 144 static void fq_flow_set_detached(struct fq_flow *f) 145 { 146 f->age = jiffies | 1UL; 147 } 148 149 static bool fq_flow_is_detached(const struct fq_flow *f) 150 { 151 return !!(f->age & 1UL); 152 } 153 154 /* special value to mark a throttled flow (not on old/new list) */ 155 static struct fq_flow throttled; 156 157 static bool fq_flow_is_throttled(const struct fq_flow *f) 158 { 159 return f->next == &throttled; 160 } 161 162 static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) 163 { 164 if (head->first) 165 head->last->next = flow; 166 else 167 head->first = flow; 168 head->last = flow; 169 flow->next = NULL; 170 } 171 172 static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f) 173 { 174 rb_erase(&f->rate_node, &q->delayed); 175 q->throttled_flows--; 176 fq_flow_add_tail(&q->old_flows, f); 177 } 178 179 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) 180 { 181 struct rb_node **p = &q->delayed.rb_node, *parent = NULL; 182 183 while (*p) { 184 struct fq_flow *aux; 185 186 parent = *p; 187 aux = rb_entry(parent, struct fq_flow, rate_node); 188 if (f->time_next_packet >= aux->time_next_packet) 189 p = &parent->rb_right; 190 else 191 p = &parent->rb_left; 192 } 193 rb_link_node(&f->rate_node, parent, p); 194 rb_insert_color(&f->rate_node, &q->delayed); 195 q->throttled_flows++; 196 q->stat_throttled++; 197 198 f->next = &throttled; 199 if (q->time_next_delayed_flow > f->time_next_packet) 200 q->time_next_delayed_flow = f->time_next_packet; 201 } 202 203 204 static struct kmem_cache *fq_flow_cachep __read_mostly; 205 206 207 /* limit number of collected flows per round */ 208 #define FQ_GC_MAX 8 209 #define FQ_GC_AGE (3*HZ) 210 211 static bool fq_gc_candidate(const struct fq_flow *f) 212 { 213 return fq_flow_is_detached(f) && 214 time_after(jiffies, f->age + FQ_GC_AGE); 215 } 216 217 static void fq_gc(struct fq_sched_data *q, 218 struct rb_root *root, 219 struct sock *sk) 220 { 221 struct rb_node **p, *parent; 222 void *tofree[FQ_GC_MAX]; 223 struct fq_flow *f; 224 int i, fcnt = 0; 225 226 p = &root->rb_node; 227 parent = NULL; 228 while (*p) { 229 parent = *p; 230 231 f = rb_entry(parent, struct fq_flow, fq_node); 232 if (f->sk == sk) 233 break; 234 235 if (fq_gc_candidate(f)) { 236 tofree[fcnt++] = f; 237 if (fcnt == FQ_GC_MAX) 238 break; 239 } 240 241 if (f->sk > sk) 242 p = &parent->rb_right; 243 else 244 p = &parent->rb_left; 245 } 246 247 if (!fcnt) 248 return; 249 250 for (i = fcnt; i > 0; ) { 251 f = tofree[--i]; 252 rb_erase(&f->fq_node, root); 253 } 254 q->flows -= fcnt; 255 q->inactive_flows -= fcnt; 256 q->stat_gc_flows += fcnt; 257 258 kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); 259 } 260 261 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) 262 { 263 struct rb_node **p, *parent; 264 struct sock *sk = skb->sk; 265 struct rb_root *root; 266 struct fq_flow *f; 267 268 /* warning: no starvation prevention... */ 269 if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) 270 return &q->internal; 271 272 /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket 273 * or a listener (SYNCOOKIE mode) 274 * 1) request sockets are not full blown, 275 * they do not contain sk_pacing_rate 276 * 2) They are not part of a 'flow' yet 277 * 3) We do not want to rate limit them (eg SYNFLOOD attack), 278 * especially if the listener set SO_MAX_PACING_RATE 279 * 4) We pretend they are orphaned 280 */ 281 if (!sk || sk_listener(sk)) { 282 unsigned long hash = skb_get_hash(skb) & q->orphan_mask; 283 284 /* By forcing low order bit to 1, we make sure to not 285 * collide with a local flow (socket pointers are word aligned) 286 */ 287 sk = (struct sock *)((hash << 1) | 1UL); 288 skb_orphan(skb); 289 } else if (sk->sk_state == TCP_CLOSE) { 290 unsigned long hash = skb_get_hash(skb) & q->orphan_mask; 291 /* 292 * Sockets in TCP_CLOSE are non connected. 293 * Typical use case is UDP sockets, they can send packets 294 * with sendto() to many different destinations. 295 * We probably could use a generic bit advertising 296 * non connected sockets, instead of sk_state == TCP_CLOSE, 297 * if we care enough. 298 */ 299 sk = (struct sock *)((hash << 1) | 1UL); 300 } 301 302 root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; 303 304 if (q->flows >= (2U << q->fq_trees_log) && 305 q->inactive_flows > q->flows/2) 306 fq_gc(q, root, sk); 307 308 p = &root->rb_node; 309 parent = NULL; 310 while (*p) { 311 parent = *p; 312 313 f = rb_entry(parent, struct fq_flow, fq_node); 314 if (f->sk == sk) { 315 /* socket might have been reallocated, so check 316 * if its sk_hash is the same. 317 * It not, we need to refill credit with 318 * initial quantum 319 */ 320 if (unlikely(skb->sk == sk && 321 f->socket_hash != sk->sk_hash)) { 322 f->credit = q->initial_quantum; 323 f->socket_hash = sk->sk_hash; 324 if (q->rate_enable) 325 smp_store_release(&sk->sk_pacing_status, 326 SK_PACING_FQ); 327 if (fq_flow_is_throttled(f)) 328 fq_flow_unset_throttled(q, f); 329 f->time_next_packet = 0ULL; 330 } 331 return f; 332 } 333 if (f->sk > sk) 334 p = &parent->rb_right; 335 else 336 p = &parent->rb_left; 337 } 338 339 f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); 340 if (unlikely(!f)) { 341 q->stat_allocation_errors++; 342 return &q->internal; 343 } 344 /* f->t_root is already zeroed after kmem_cache_zalloc() */ 345 346 fq_flow_set_detached(f); 347 f->sk = sk; 348 if (skb->sk == sk) { 349 f->socket_hash = sk->sk_hash; 350 if (q->rate_enable) 351 smp_store_release(&sk->sk_pacing_status, 352 SK_PACING_FQ); 353 } 354 f->credit = q->initial_quantum; 355 356 rb_link_node(&f->fq_node, parent, p); 357 rb_insert_color(&f->fq_node, root); 358 359 q->flows++; 360 q->inactive_flows++; 361 return f; 362 } 363 364 static struct sk_buff *fq_peek(struct fq_flow *flow) 365 { 366 struct sk_buff *skb = skb_rb_first(&flow->t_root); 367 struct sk_buff *head = flow->head; 368 369 if (!skb) 370 return head; 371 372 if (!head) 373 return skb; 374 375 if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) 376 return skb; 377 return head; 378 } 379 380 static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, 381 struct sk_buff *skb) 382 { 383 if (skb == flow->head) { 384 flow->head = skb->next; 385 } else { 386 rb_erase(&skb->rbnode, &flow->t_root); 387 skb->dev = qdisc_dev(sch); 388 } 389 } 390 391 /* Remove one skb from flow queue. 392 * This skb must be the return value of prior fq_peek(). 393 */ 394 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, 395 struct sk_buff *skb) 396 { 397 fq_erase_head(sch, flow, skb); 398 skb_mark_not_on_list(skb); 399 flow->qlen--; 400 qdisc_qstats_backlog_dec(sch, skb); 401 sch->q.qlen--; 402 } 403 404 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) 405 { 406 struct rb_node **p, *parent; 407 struct sk_buff *head, *aux; 408 409 head = flow->head; 410 if (!head || 411 fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { 412 if (!head) 413 flow->head = skb; 414 else 415 flow->tail->next = skb; 416 flow->tail = skb; 417 skb->next = NULL; 418 return; 419 } 420 421 p = &flow->t_root.rb_node; 422 parent = NULL; 423 424 while (*p) { 425 parent = *p; 426 aux = rb_to_skb(parent); 427 if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) 428 p = &parent->rb_right; 429 else 430 p = &parent->rb_left; 431 } 432 rb_link_node(&skb->rbnode, parent, p); 433 rb_insert_color(&skb->rbnode, &flow->t_root); 434 } 435 436 static bool fq_packet_beyond_horizon(const struct sk_buff *skb, 437 const struct fq_sched_data *q) 438 { 439 return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); 440 } 441 442 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, 443 struct sk_buff **to_free) 444 { 445 struct fq_sched_data *q = qdisc_priv(sch); 446 struct fq_flow *f; 447 448 if (unlikely(sch->q.qlen >= sch->limit)) 449 return qdisc_drop(skb, sch, to_free); 450 451 if (!skb->tstamp) { 452 fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); 453 } else { 454 /* Check if packet timestamp is too far in the future. 455 * Try first if our cached value, to avoid ktime_get_ns() 456 * cost in most cases. 457 */ 458 if (fq_packet_beyond_horizon(skb, q)) { 459 /* Refresh our cache and check another time */ 460 q->ktime_cache = ktime_get_ns(); 461 if (fq_packet_beyond_horizon(skb, q)) { 462 if (q->horizon_drop) { 463 q->stat_horizon_drops++; 464 return qdisc_drop(skb, sch, to_free); 465 } 466 q->stat_horizon_caps++; 467 skb->tstamp = q->ktime_cache + q->horizon; 468 } 469 } 470 fq_skb_cb(skb)->time_to_send = skb->tstamp; 471 } 472 473 f = fq_classify(skb, q); 474 if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { 475 q->stat_flows_plimit++; 476 return qdisc_drop(skb, sch, to_free); 477 } 478 479 f->qlen++; 480 qdisc_qstats_backlog_inc(sch, skb); 481 if (fq_flow_is_detached(f)) { 482 fq_flow_add_tail(&q->new_flows, f); 483 if (time_after(jiffies, f->age + q->flow_refill_delay)) 484 f->credit = max_t(u32, f->credit, q->quantum); 485 q->inactive_flows--; 486 } 487 488 /* Note: this overwrites f->age */ 489 flow_queue_add(f, skb); 490 491 if (unlikely(f == &q->internal)) { 492 q->stat_internal_packets++; 493 } 494 sch->q.qlen++; 495 496 return NET_XMIT_SUCCESS; 497 } 498 499 static void fq_check_throttled(struct fq_sched_data *q, u64 now) 500 { 501 unsigned long sample; 502 struct rb_node *p; 503 504 if (q->time_next_delayed_flow > now) 505 return; 506 507 /* Update unthrottle latency EWMA. 508 * This is cheap and can help diagnosing timer/latency problems. 509 */ 510 sample = (unsigned long)(now - q->time_next_delayed_flow); 511 q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; 512 q->unthrottle_latency_ns += sample >> 3; 513 514 q->time_next_delayed_flow = ~0ULL; 515 while ((p = rb_first(&q->delayed)) != NULL) { 516 struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node); 517 518 if (f->time_next_packet > now) { 519 q->time_next_delayed_flow = f->time_next_packet; 520 break; 521 } 522 fq_flow_unset_throttled(q, f); 523 } 524 } 525 526 static struct sk_buff *fq_dequeue(struct Qdisc *sch) 527 { 528 struct fq_sched_data *q = qdisc_priv(sch); 529 struct fq_flow_head *head; 530 struct sk_buff *skb; 531 struct fq_flow *f; 532 unsigned long rate; 533 u32 plen; 534 u64 now; 535 536 if (!sch->q.qlen) 537 return NULL; 538 539 skb = fq_peek(&q->internal); 540 if (unlikely(skb)) { 541 fq_dequeue_skb(sch, &q->internal, skb); 542 goto out; 543 } 544 545 q->ktime_cache = now = ktime_get_ns(); 546 fq_check_throttled(q, now); 547 begin: 548 head = &q->new_flows; 549 if (!head->first) { 550 head = &q->old_flows; 551 if (!head->first) { 552 if (q->time_next_delayed_flow != ~0ULL) 553 qdisc_watchdog_schedule_range_ns(&q->watchdog, 554 q->time_next_delayed_flow, 555 q->timer_slack); 556 return NULL; 557 } 558 } 559 f = head->first; 560 561 if (f->credit <= 0) { 562 f->credit += q->quantum; 563 head->first = f->next; 564 fq_flow_add_tail(&q->old_flows, f); 565 goto begin; 566 } 567 568 skb = fq_peek(f); 569 if (skb) { 570 u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, 571 f->time_next_packet); 572 573 if (now < time_next_packet) { 574 head->first = f->next; 575 f->time_next_packet = time_next_packet; 576 fq_flow_set_throttled(q, f); 577 goto begin; 578 } 579 prefetch(&skb->end); 580 if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { 581 INET_ECN_set_ce(skb); 582 q->stat_ce_mark++; 583 } 584 fq_dequeue_skb(sch, f, skb); 585 } else { 586 head->first = f->next; 587 /* force a pass through old_flows to prevent starvation */ 588 if ((head == &q->new_flows) && q->old_flows.first) { 589 fq_flow_add_tail(&q->old_flows, f); 590 } else { 591 fq_flow_set_detached(f); 592 q->inactive_flows++; 593 } 594 goto begin; 595 } 596 plen = qdisc_pkt_len(skb); 597 f->credit -= plen; 598 599 if (!q->rate_enable) 600 goto out; 601 602 rate = q->flow_max_rate; 603 604 /* If EDT time was provided for this skb, we need to 605 * update f->time_next_packet only if this qdisc enforces 606 * a flow max rate. 607 */ 608 if (!skb->tstamp) { 609 if (skb->sk) 610 rate = min(skb->sk->sk_pacing_rate, rate); 611 612 if (rate <= q->low_rate_threshold) { 613 f->credit = 0; 614 } else { 615 plen = max(plen, q->quantum); 616 if (f->credit > 0) 617 goto out; 618 } 619 } 620 if (rate != ~0UL) { 621 u64 len = (u64)plen * NSEC_PER_SEC; 622 623 if (likely(rate)) 624 len = div64_ul(len, rate); 625 /* Since socket rate can change later, 626 * clamp the delay to 1 second. 627 * Really, providers of too big packets should be fixed ! 628 */ 629 if (unlikely(len > NSEC_PER_SEC)) { 630 len = NSEC_PER_SEC; 631 q->stat_pkts_too_long++; 632 } 633 /* Account for schedule/timers drifts. 634 * f->time_next_packet was set when prior packet was sent, 635 * and current time (@now) can be too late by tens of us. 636 */ 637 if (f->time_next_packet) 638 len -= min(len/2, now - f->time_next_packet); 639 f->time_next_packet = now + len; 640 } 641 out: 642 qdisc_bstats_update(sch, skb); 643 return skb; 644 } 645 646 static void fq_flow_purge(struct fq_flow *flow) 647 { 648 struct rb_node *p = rb_first(&flow->t_root); 649 650 while (p) { 651 struct sk_buff *skb = rb_to_skb(p); 652 653 p = rb_next(p); 654 rb_erase(&skb->rbnode, &flow->t_root); 655 rtnl_kfree_skbs(skb, skb); 656 } 657 rtnl_kfree_skbs(flow->head, flow->tail); 658 flow->head = NULL; 659 flow->qlen = 0; 660 } 661 662 static void fq_reset(struct Qdisc *sch) 663 { 664 struct fq_sched_data *q = qdisc_priv(sch); 665 struct rb_root *root; 666 struct rb_node *p; 667 struct fq_flow *f; 668 unsigned int idx; 669 670 sch->q.qlen = 0; 671 sch->qstats.backlog = 0; 672 673 fq_flow_purge(&q->internal); 674 675 if (!q->fq_root) 676 return; 677 678 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { 679 root = &q->fq_root[idx]; 680 while ((p = rb_first(root)) != NULL) { 681 f = rb_entry(p, struct fq_flow, fq_node); 682 rb_erase(p, root); 683 684 fq_flow_purge(f); 685 686 kmem_cache_free(fq_flow_cachep, f); 687 } 688 } 689 q->new_flows.first = NULL; 690 q->old_flows.first = NULL; 691 q->delayed = RB_ROOT; 692 q->flows = 0; 693 q->inactive_flows = 0; 694 q->throttled_flows = 0; 695 } 696 697 static void fq_rehash(struct fq_sched_data *q, 698 struct rb_root *old_array, u32 old_log, 699 struct rb_root *new_array, u32 new_log) 700 { 701 struct rb_node *op, **np, *parent; 702 struct rb_root *oroot, *nroot; 703 struct fq_flow *of, *nf; 704 int fcnt = 0; 705 u32 idx; 706 707 for (idx = 0; idx < (1U << old_log); idx++) { 708 oroot = &old_array[idx]; 709 while ((op = rb_first(oroot)) != NULL) { 710 rb_erase(op, oroot); 711 of = rb_entry(op, struct fq_flow, fq_node); 712 if (fq_gc_candidate(of)) { 713 fcnt++; 714 kmem_cache_free(fq_flow_cachep, of); 715 continue; 716 } 717 nroot = &new_array[hash_ptr(of->sk, new_log)]; 718 719 np = &nroot->rb_node; 720 parent = NULL; 721 while (*np) { 722 parent = *np; 723 724 nf = rb_entry(parent, struct fq_flow, fq_node); 725 BUG_ON(nf->sk == of->sk); 726 727 if (nf->sk > of->sk) 728 np = &parent->rb_right; 729 else 730 np = &parent->rb_left; 731 } 732 733 rb_link_node(&of->fq_node, parent, np); 734 rb_insert_color(&of->fq_node, nroot); 735 } 736 } 737 q->flows -= fcnt; 738 q->inactive_flows -= fcnt; 739 q->stat_gc_flows += fcnt; 740 } 741 742 static void fq_free(void *addr) 743 { 744 kvfree(addr); 745 } 746 747 static int fq_resize(struct Qdisc *sch, u32 log) 748 { 749 struct fq_sched_data *q = qdisc_priv(sch); 750 struct rb_root *array; 751 void *old_fq_root; 752 u32 idx; 753 754 if (q->fq_root && log == q->fq_trees_log) 755 return 0; 756 757 /* If XPS was setup, we can allocate memory on right NUMA node */ 758 array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL, 759 netdev_queue_numa_node_read(sch->dev_queue)); 760 if (!array) 761 return -ENOMEM; 762 763 for (idx = 0; idx < (1U << log); idx++) 764 array[idx] = RB_ROOT; 765 766 sch_tree_lock(sch); 767 768 old_fq_root = q->fq_root; 769 if (old_fq_root) 770 fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); 771 772 q->fq_root = array; 773 q->fq_trees_log = log; 774 775 sch_tree_unlock(sch); 776 777 fq_free(old_fq_root); 778 779 return 0; 780 } 781 782 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { 783 [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, 784 785 [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, 786 [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, 787 [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, 788 [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, 789 [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, 790 [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, 791 [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, 792 [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, 793 [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, 794 [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, 795 [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, 796 [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, 797 [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, 798 [TCA_FQ_HORIZON] = { .type = NLA_U32 }, 799 [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, 800 }; 801 802 static int fq_change(struct Qdisc *sch, struct nlattr *opt, 803 struct netlink_ext_ack *extack) 804 { 805 struct fq_sched_data *q = qdisc_priv(sch); 806 struct nlattr *tb[TCA_FQ_MAX + 1]; 807 int err, drop_count = 0; 808 unsigned drop_len = 0; 809 u32 fq_log; 810 811 err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, 812 NULL); 813 if (err < 0) 814 return err; 815 816 sch_tree_lock(sch); 817 818 fq_log = q->fq_trees_log; 819 820 if (tb[TCA_FQ_BUCKETS_LOG]) { 821 u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); 822 823 if (nval >= 1 && nval <= ilog2(256*1024)) 824 fq_log = nval; 825 else 826 err = -EINVAL; 827 } 828 if (tb[TCA_FQ_PLIMIT]) 829 sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); 830 831 if (tb[TCA_FQ_FLOW_PLIMIT]) 832 q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); 833 834 if (tb[TCA_FQ_QUANTUM]) { 835 u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); 836 837 if (quantum > 0 && quantum <= (1 << 20)) { 838 q->quantum = quantum; 839 } else { 840 NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); 841 err = -EINVAL; 842 } 843 } 844 845 if (tb[TCA_FQ_INITIAL_QUANTUM]) 846 q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); 847 848 if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) 849 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", 850 nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); 851 852 if (tb[TCA_FQ_FLOW_MAX_RATE]) { 853 u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); 854 855 q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; 856 } 857 if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) 858 q->low_rate_threshold = 859 nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); 860 861 if (tb[TCA_FQ_RATE_ENABLE]) { 862 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); 863 864 if (enable <= 1) 865 q->rate_enable = enable; 866 else 867 err = -EINVAL; 868 } 869 870 if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { 871 u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; 872 873 q->flow_refill_delay = usecs_to_jiffies(usecs_delay); 874 } 875 876 if (tb[TCA_FQ_ORPHAN_MASK]) 877 q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); 878 879 if (tb[TCA_FQ_CE_THRESHOLD]) 880 q->ce_threshold = (u64)NSEC_PER_USEC * 881 nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); 882 883 if (tb[TCA_FQ_TIMER_SLACK]) 884 q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); 885 886 if (tb[TCA_FQ_HORIZON]) 887 q->horizon = (u64)NSEC_PER_USEC * 888 nla_get_u32(tb[TCA_FQ_HORIZON]); 889 890 if (tb[TCA_FQ_HORIZON_DROP]) 891 q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); 892 893 if (!err) { 894 895 sch_tree_unlock(sch); 896 err = fq_resize(sch, fq_log); 897 sch_tree_lock(sch); 898 } 899 while (sch->q.qlen > sch->limit) { 900 struct sk_buff *skb = fq_dequeue(sch); 901 902 if (!skb) 903 break; 904 drop_len += qdisc_pkt_len(skb); 905 rtnl_kfree_skbs(skb, skb); 906 drop_count++; 907 } 908 qdisc_tree_reduce_backlog(sch, drop_count, drop_len); 909 910 sch_tree_unlock(sch); 911 return err; 912 } 913 914 static void fq_destroy(struct Qdisc *sch) 915 { 916 struct fq_sched_data *q = qdisc_priv(sch); 917 918 fq_reset(sch); 919 fq_free(q->fq_root); 920 qdisc_watchdog_cancel(&q->watchdog); 921 } 922 923 static int fq_init(struct Qdisc *sch, struct nlattr *opt, 924 struct netlink_ext_ack *extack) 925 { 926 struct fq_sched_data *q = qdisc_priv(sch); 927 int err; 928 929 sch->limit = 10000; 930 q->flow_plimit = 100; 931 q->quantum = 2 * psched_mtu(qdisc_dev(sch)); 932 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); 933 q->flow_refill_delay = msecs_to_jiffies(40); 934 q->flow_max_rate = ~0UL; 935 q->time_next_delayed_flow = ~0ULL; 936 q->rate_enable = 1; 937 q->new_flows.first = NULL; 938 q->old_flows.first = NULL; 939 q->delayed = RB_ROOT; 940 q->fq_root = NULL; 941 q->fq_trees_log = ilog2(1024); 942 q->orphan_mask = 1024 - 1; 943 q->low_rate_threshold = 550000 / 8; 944 945 q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ 946 947 q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ 948 q->horizon_drop = 1; /* by default, drop packets beyond horizon */ 949 950 /* Default ce_threshold of 4294 seconds */ 951 q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; 952 953 qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); 954 955 if (opt) 956 err = fq_change(sch, opt, extack); 957 else 958 err = fq_resize(sch, q->fq_trees_log); 959 960 return err; 961 } 962 963 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) 964 { 965 struct fq_sched_data *q = qdisc_priv(sch); 966 u64 ce_threshold = q->ce_threshold; 967 u64 horizon = q->horizon; 968 struct nlattr *opts; 969 970 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 971 if (opts == NULL) 972 goto nla_put_failure; 973 974 /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ 975 976 do_div(ce_threshold, NSEC_PER_USEC); 977 do_div(horizon, NSEC_PER_USEC); 978 979 if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || 980 nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || 981 nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || 982 nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || 983 nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || 984 nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, 985 min_t(unsigned long, q->flow_max_rate, ~0U)) || 986 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, 987 jiffies_to_usecs(q->flow_refill_delay)) || 988 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || 989 nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, 990 q->low_rate_threshold) || 991 nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || 992 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || 993 nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || 994 nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || 995 nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) 996 goto nla_put_failure; 997 998 return nla_nest_end(skb, opts); 999 1000 nla_put_failure: 1001 return -1; 1002 } 1003 1004 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 1005 { 1006 struct fq_sched_data *q = qdisc_priv(sch); 1007 struct tc_fq_qd_stats st; 1008 1009 sch_tree_lock(sch); 1010 1011 st.gc_flows = q->stat_gc_flows; 1012 st.highprio_packets = q->stat_internal_packets; 1013 st.tcp_retrans = 0; 1014 st.throttled = q->stat_throttled; 1015 st.flows_plimit = q->stat_flows_plimit; 1016 st.pkts_too_long = q->stat_pkts_too_long; 1017 st.allocation_errors = q->stat_allocation_errors; 1018 st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - 1019 ktime_get_ns(); 1020 st.flows = q->flows; 1021 st.inactive_flows = q->inactive_flows; 1022 st.throttled_flows = q->throttled_flows; 1023 st.unthrottle_latency_ns = min_t(unsigned long, 1024 q->unthrottle_latency_ns, ~0U); 1025 st.ce_mark = q->stat_ce_mark; 1026 st.horizon_drops = q->stat_horizon_drops; 1027 st.horizon_caps = q->stat_horizon_caps; 1028 sch_tree_unlock(sch); 1029 1030 return gnet_stats_copy_app(d, &st, sizeof(st)); 1031 } 1032 1033 static struct Qdisc_ops fq_qdisc_ops __read_mostly = { 1034 .id = "fq", 1035 .priv_size = sizeof(struct fq_sched_data), 1036 1037 .enqueue = fq_enqueue, 1038 .dequeue = fq_dequeue, 1039 .peek = qdisc_peek_dequeued, 1040 .init = fq_init, 1041 .reset = fq_reset, 1042 .destroy = fq_destroy, 1043 .change = fq_change, 1044 .dump = fq_dump, 1045 .dump_stats = fq_dump_stats, 1046 .owner = THIS_MODULE, 1047 }; 1048 1049 static int __init fq_module_init(void) 1050 { 1051 int ret; 1052 1053 fq_flow_cachep = kmem_cache_create("fq_flow_cache", 1054 sizeof(struct fq_flow), 1055 0, 0, NULL); 1056 if (!fq_flow_cachep) 1057 return -ENOMEM; 1058 1059 ret = register_qdisc(&fq_qdisc_ops); 1060 if (ret) 1061 kmem_cache_destroy(fq_flow_cachep); 1062 return ret; 1063 } 1064 1065 static void __exit fq_module_exit(void) 1066 { 1067 unregister_qdisc(&fq_qdisc_ops); 1068 kmem_cache_destroy(fq_flow_cachep); 1069 } 1070 1071 module_init(fq_module_init) 1072 module_exit(fq_module_exit) 1073 MODULE_AUTHOR("Eric Dumazet"); 1074 MODULE_LICENSE("GPL"); 1075 MODULE_DESCRIPTION("Fair Queue Packet Scheduler"); 1076