// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive
	traffic into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- the maximal queue length per flow to 127 packets,
	- the max mtu to 2^18-1,
	- the number of flows to 65408,
	- the number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;
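/* A worked example of the index space implied by the constants above:
 * SFQ_MAX_FLOWS = 0x10000 - 127 - 1 = 65408 flow slots, plus
 * SFQ_MAX_DEPTH + 1 = 128 depth-list anchors, is exactly 65536 values,
 * the full range of a u16:
 *
 *	0 ... 65407		index into slots[]
 *	65408 ... 65535		index into dep[] (as value - SFQ_MAX_FLOWS)
 */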
/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[]
 * array, while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS +
 * SFQ_MAX_DEPTH] are 'pointers' to the dep[] array.
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen;		/* number of skbs in skblist */
	sfq_index	next;		/* next slot in sfq RR chain */
	struct sfq_head	dep;		/* anchor in dep[] chains */
	unsigned short	hash;		/* hash value (index in ht[]) */
	int		allot;		/* credit for this slot */

	unsigned int	backlog;
	struct red_vars	vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t	perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot	*tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
	struct Qdisc	*sch;
};

/*
 * sfq_head are either in a sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}
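/* A worked example of the mapping above: sfq_classify() returns 0 for
 * "drop", or a 1-based bucket number.  Because the divisor is a power of
 * two (enforced in sfq_change()), sfq_hash() reduces the perturbed flow
 * hash with a mask; with the default divisor of 1024:
 *
 *	bucket = hash & (1024 - 1);	equivalent to hash % 1024
 *
 * so a flow hash of e.g. 0x2a0007 selects bucket 0x007.
 */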
/*
 * x : slot number [0 ... SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen;
	WRITE_ONCE(q->slots[x].qlen, d - 1);
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen + 1;
	WRITE_ONCE(q->slots[x].qlen, d);
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions: might be changed when/if skb uses a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
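/* The cast (struct sk_buff *)slot in the helpers above relies on
 * skblist_next/skblist_prev being the first two fields of struct sfq_slot,
 * mirroring the next/prev pointers at the start of struct sk_buff: the
 * slot itself serves as the sentinel node of a circular doubly linked
 * list.  An empty slot therefore satisfies
 *
 *	slot->skblist_next == slot->skblist_prev == (struct sk_buff *)slot
 *
 * which is exactly the state slot_queue_init() establishes.
 */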
static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		WRITE_ONCE(slot->backlog, slot->backlog - len);
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		if (slot->next == x)
			q->tail = NULL; /* no more active slots */
		else
			q->tail->next = slot->next;
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
		goto drop;
	}

	return 0;
}

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
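/* For illustration, the two mark helpers above split the RED flag space
 * as follows for the actions taken in sfq_enqueue() below (TC_RED_HARDDROP
 * only matters when TC_RED_ECN is also set):
 *
 *	flags				RED_PROB_MARK	RED_HARD_MARK
 *	0				drop		drop
 *	TC_RED_ECN			mark		mark
 *	TC_RED_ECN | TC_RED_HARDDROP	mark		drop
 */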
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS);
		WRITE_ONCE(q->ht[hash], x);
		slot = &q->slots[x];
		slot->hash = hash;
		WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							     &slot->vars,
							     slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_FLOW_LIMIT);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		WRITE_ONCE(slot->backlog, slot->backlog - delta);
		qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old
		 * ones, but we could otherwise end up servicing new flows
		 * only, and freeze the old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		WRITE_ONCE(slot->allot, q->quantum);
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		WRITE_ONCE(slot->allot, slot->allot + q->quantum);
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb));
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}
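/* A worked example of the deficit round-robin accounting in sfq_dequeue()
 * above, assuming a quantum of 1514 bytes and a slot full of 1000-byte
 * packets: the slot starts with allot = 1514, sends one packet
 * (allot = 514), sends another (allot = -486), and is then skipped until
 * the next round tops it up to allot = 1028.  Averaged over many rounds,
 * every active slot sends one quantum of bytes per round.
 */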
/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to
 * change counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		WRITE_ONCE(slot->backlog, 0);
		red_set_vars(&slot->vars);
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			WRITE_ONCE(q->ht[hash], x);
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			WRITE_ONCE(slot->allot, q->quantum);
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;
	siphash_key_t nkey;
	int period;

	get_random_bytes(&nkey, sizeof(nkey));
	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	/* q->perturb_period can change under us from
	 * sfq_change() and sfq_destroy().
	 */
	period = READ_ONCE(q->perturb_period);
	if (period)
		mod_timer(&q->perturb_timer, jiffies + period);
	rcu_read_unlock();
}
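/* For illustration, the timer above is armed whenever a non-zero perturb
 * period is configured from user space, e.g. (hypothetical device name):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * which re-keys the hash every 10 seconds, so flows that collide into the
 * same bucket do not stay paired forever.
 */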
static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;
	unsigned int maxflows;
	unsigned int quantum;
	unsigned int divisor;
	int perturb_period;
	u8 headdrop;
	u8 maxdepth;
	int limit;
	u8 flags;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	if ((int)ctl->quantum < 0) {
		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
		return -EINVAL;
	}

	if (ctl->perturb_period < 0 ||
	    ctl->perturb_period > INT_MAX / HZ) {
		NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
		return -EINVAL;
	}
	perturb_period = ctl->perturb_period * HZ;

	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc_obj(*p);
		if (!p)
			return -ENOMEM;
	}

	sch_tree_lock(sch);

	limit = q->limit;
	divisor = q->divisor;
	headdrop = q->headdrop;
	maxdepth = q->maxdepth;
	maxflows = q->maxflows;
	quantum = q->quantum;
	flags = q->flags;

	/* update and validate configuration */
	if (ctl->quantum)
		quantum = ctl->quantum;
	if (ctl->flows)
		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		divisor = ctl->divisor;
		maxflows = min_t(u32, maxflows, divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			red_set_parms(p,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		flags = ctl_v1->flags;
		headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
		maxflows = min_t(u32, maxflows, limit);
	}
	if (limit == 1) {
		sch_tree_unlock(sch);
		kfree(p);
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}

	/* commit configuration */
	q->limit = limit;
	q->divisor = divisor;
	q->headdrop = headdrop;
	q->maxdepth = maxdepth;
	q->maxflows = maxflows;
	WRITE_ONCE(q->perturb_period, perturb_period);
	q->quantum = quantum;
	q->flags = flags;
	if (p)
		swap(q->red_parms, p);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	timer_delete(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}
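/* A worked example (hypothetical values) of the clamping in sfq_change()
 * above: with depth 64, flows 256 and a requested limit of 100000, the
 * limit is clamped to maxdepth * maxflows = 64 * 256 = 16384, and maxflows
 * stays at min(256, 16384) = 256.  A requested limit of 100 instead
 * shrinks maxflows to 100, since no more than 'limit' flows can ever be
 * backlogged at once.
 */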
static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	WRITE_ONCE(q->perturb_period, 0);
	timer_delete_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	q->sch = sch;
	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturb_period = 0;
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));

	if (opt) {
		int err = sfq_change(sch, opt, extack);

		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}
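/* SFQ is not really classful: the remaining class ops expose each hash
 * bucket as a read-only pseudo-class numbered 1 ... divisor (0 means
 * "drop" in sfq_classify()).  For illustration, sfq_dump_class_stats()
 * below maps class 'cl' to bucket ht[cl - 1], and sfq_walk() enumerates
 * the buckets as classes i + 1.
 */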
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = READ_ONCE(q->ht[cl - 1]);
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = READ_ONCE(slot->allot);
		qs.qlen = READ_ONCE(slot->qlen);
		qs.backlog = READ_ONCE(slot->backlog);
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) {
			arg->count++;
			continue;
		}
		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
			break;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("sfq");

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Stochastic Fairness qdisc");