1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. 4 * 5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 6 */ 7 8 #include <linux/module.h> 9 #include <linux/types.h> 10 #include <linux/kernel.h> 11 #include <linux/jiffies.h> 12 #include <linux/string.h> 13 #include <linux/in.h> 14 #include <linux/errno.h> 15 #include <linux/init.h> 16 #include <linux/skbuff.h> 17 #include <linux/siphash.h> 18 #include <linux/slab.h> 19 #include <linux/vmalloc.h> 20 #include <net/netlink.h> 21 #include <net/pkt_sched.h> 22 #include <net/pkt_cls.h> 23 #include <net/red.h> 24 25 26 /* Stochastic Fairness Queuing algorithm. 27 ======================================= 28 29 Source: 30 Paul E. McKenney "Stochastic Fairness Queuing", 31 IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. 32 33 Paul E. McKenney "Stochastic Fairness Queuing", 34 "Interworking: Research and Experience", v.2, 1991, p.113-131. 35 36 37 See also: 38 M. Shreedhar and George Varghese "Efficient Fair 39 Queuing using Deficit Round Robin", Proc. SIGCOMM 95. 40 41 42 This is not the thing that is usually called (W)FQ nowadays. 43 It does not use any timestamp mechanism, but instead 44 processes queues in round-robin order. 45 46 ADVANTAGE: 47 48 - It is very cheap. Both CPU and memory requirements are minimal. 49 50 DRAWBACKS: 51 52 - "Stochastic" -> It is not 100% fair. 53 When hash collisions occur, several flows are considered as one. 54 55 - "Round-robin" -> It introduces larger delays than virtual clock 56 based schemes, and should not be used for isolating interactive 57 traffic from non-interactive. It means, that this scheduler 58 should be used as leaf of CBQ or P3, which put interactive traffic 59 to higher priority band. 60 61 We still need true WFQ for top level CSZ, but using WFQ 62 for the best effort traffic is absolutely pointless: 63 SFQ is superior for this purpose. 64 65 IMPLEMENTATION: 66 This implementation limits : 67 - maximal queue length per flow to 127 packets. 68 - max mtu to 2^18-1; 69 - max 65408 flows, 70 - number of hash buckets to 65536. 71 72 It is easy to increase these values, but not in flight. */ 73 74 #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ 75 #define SFQ_DEFAULT_FLOWS 128 76 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ 77 #define SFQ_EMPTY_SLOT 0xffff 78 #define SFQ_DEFAULT_HASH_DIVISOR 1024 79 80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ 81 typedef u16 sfq_index; 82 83 /* 84 * We dont use pointers to save space. 85 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array 86 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] 87 * are 'pointers' to dep[] array 88 */ 89 struct sfq_head { 90 sfq_index next; 91 sfq_index prev; 92 }; 93 94 struct sfq_slot { 95 struct sk_buff *skblist_next; 96 struct sk_buff *skblist_prev; 97 sfq_index qlen; /* number of skbs in skblist */ 98 sfq_index next; /* next slot in sfq RR chain */ 99 struct sfq_head dep; /* anchor in dep[] chains */ 100 unsigned short hash; /* hash value (index in ht[]) */ 101 int allot; /* credit for this slot */ 102 103 unsigned int backlog; 104 struct red_vars vars; 105 }; 106 107 struct sfq_sched_data { 108 /* frequently used fields */ 109 int limit; /* limit of total number of packets in this qdisc */ 110 unsigned int divisor; /* number of slots in hash table */ 111 u8 headdrop; 112 u8 maxdepth; /* limit of packets per flow */ 113 114 siphash_key_t perturbation; 115 u8 cur_depth; /* depth of longest slot */ 116 u8 flags; 117 struct tcf_proto __rcu *filter_list; 118 struct tcf_block *block; 119 sfq_index *ht; /* Hash table ('divisor' slots) */ 120 struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ 121 122 struct red_parms *red_parms; 123 struct tc_sfqred_stats stats; 124 struct sfq_slot *tail; /* current slot in round */ 125 126 struct sfq_head dep[SFQ_MAX_DEPTH + 1]; 127 /* Linked lists of slots, indexed by depth 128 * dep[0] : list of unused flows 129 * dep[1] : list of flows with 1 packet 130 * dep[X] : list of flows with X packets 131 */ 132 133 unsigned int maxflows; /* number of flows in flows array */ 134 int perturb_period; 135 unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ 136 struct timer_list perturb_timer; 137 struct Qdisc *sch; 138 }; 139 140 /* 141 * sfq_head are either in a sfq_slot or in dep[] array 142 */ 143 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) 144 { 145 if (val < SFQ_MAX_FLOWS) 146 return &q->slots[val].dep; 147 return &q->dep[val - SFQ_MAX_FLOWS]; 148 } 149 150 static unsigned int sfq_hash(const struct sfq_sched_data *q, 151 const struct sk_buff *skb) 152 { 153 return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1); 154 } 155 156 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, 157 int *qerr) 158 { 159 struct sfq_sched_data *q = qdisc_priv(sch); 160 struct tcf_result res; 161 struct tcf_proto *fl; 162 int result; 163 164 if (TC_H_MAJ(skb->priority) == sch->handle && 165 TC_H_MIN(skb->priority) > 0 && 166 TC_H_MIN(skb->priority) <= q->divisor) 167 return TC_H_MIN(skb->priority); 168 169 fl = rcu_dereference_bh(q->filter_list); 170 if (!fl) 171 return sfq_hash(q, skb) + 1; 172 173 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 174 result = tcf_classify(skb, NULL, fl, &res, false); 175 if (result >= 0) { 176 #ifdef CONFIG_NET_CLS_ACT 177 switch (result) { 178 case TC_ACT_STOLEN: 179 case TC_ACT_QUEUED: 180 case TC_ACT_TRAP: 181 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 182 fallthrough; 183 case TC_ACT_SHOT: 184 return 0; 185 } 186 #endif 187 if (TC_H_MIN(res.classid) <= q->divisor) 188 return TC_H_MIN(res.classid); 189 } 190 return 0; 191 } 192 193 /* 194 * x : slot number [0 .. SFQ_MAX_FLOWS - 1] 195 */ 196 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) 197 { 198 sfq_index p, n; 199 struct sfq_slot *slot = &q->slots[x]; 200 int qlen = slot->qlen; 201 202 p = qlen + SFQ_MAX_FLOWS; 203 n = q->dep[qlen].next; 204 205 slot->dep.next = n; 206 slot->dep.prev = p; 207 208 q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ 209 sfq_dep_head(q, n)->prev = x; 210 } 211 212 #define sfq_unlink(q, x, n, p) \ 213 do { \ 214 n = q->slots[x].dep.next; \ 215 p = q->slots[x].dep.prev; \ 216 sfq_dep_head(q, p)->next = n; \ 217 sfq_dep_head(q, n)->prev = p; \ 218 } while (0) 219 220 221 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) 222 { 223 sfq_index p, n; 224 int d; 225 226 sfq_unlink(q, x, n, p); 227 228 d = q->slots[x].qlen; 229 WRITE_ONCE(q->slots[x].qlen, d - 1); 230 if (n == p && q->cur_depth == d) 231 q->cur_depth--; 232 sfq_link(q, x); 233 } 234 235 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) 236 { 237 sfq_index p, n; 238 int d; 239 240 sfq_unlink(q, x, n, p); 241 242 d = q->slots[x].qlen + 1; 243 WRITE_ONCE(q->slots[x].qlen, d); 244 if (q->cur_depth < d) 245 q->cur_depth = d; 246 sfq_link(q, x); 247 } 248 249 /* helper functions : might be changed when/if skb use a standard list_head */ 250 251 /* remove one skb from tail of slot queue */ 252 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) 253 { 254 struct sk_buff *skb = slot->skblist_prev; 255 256 slot->skblist_prev = skb->prev; 257 skb->prev->next = (struct sk_buff *)slot; 258 skb->next = skb->prev = NULL; 259 return skb; 260 } 261 262 /* remove one skb from head of slot queue */ 263 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) 264 { 265 struct sk_buff *skb = slot->skblist_next; 266 267 slot->skblist_next = skb->next; 268 skb->next->prev = (struct sk_buff *)slot; 269 skb->next = skb->prev = NULL; 270 return skb; 271 } 272 273 static inline void slot_queue_init(struct sfq_slot *slot) 274 { 275 memset(slot, 0, sizeof(*slot)); 276 slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; 277 } 278 279 /* add skb to slot queue (tail add) */ 280 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) 281 { 282 skb->prev = slot->skblist_prev; 283 skb->next = (struct sk_buff *)slot; 284 slot->skblist_prev->next = skb; 285 slot->skblist_prev = skb; 286 } 287 288 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) 289 { 290 struct sfq_sched_data *q = qdisc_priv(sch); 291 sfq_index x, d = q->cur_depth; 292 struct sk_buff *skb; 293 unsigned int len; 294 struct sfq_slot *slot; 295 296 /* Queue is full! Find the longest slot and drop tail packet from it */ 297 if (d > 1) { 298 x = q->dep[d].next; 299 slot = &q->slots[x]; 300 drop: 301 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); 302 len = qdisc_pkt_len(skb); 303 WRITE_ONCE(slot->backlog, slot->backlog - len); 304 sfq_dec(q, x); 305 qdisc_qlen_dec(sch); 306 qdisc_qstats_backlog_dec(sch, skb); 307 qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); 308 return len; 309 } 310 311 if (d == 1) { 312 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ 313 x = q->tail->next; 314 slot = &q->slots[x]; 315 if (slot->next == x) 316 q->tail = NULL; /* no more active slots */ 317 else 318 q->tail->next = slot->next; 319 WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); 320 goto drop; 321 } 322 323 return 0; 324 } 325 326 /* Is ECN parameter configured */ 327 static int sfq_prob_mark(const struct sfq_sched_data *q) 328 { 329 return q->flags & TC_RED_ECN; 330 } 331 332 /* Should packets over max threshold just be marked */ 333 static int sfq_hard_mark(const struct sfq_sched_data *q) 334 { 335 return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; 336 } 337 338 static int sfq_headdrop(const struct sfq_sched_data *q) 339 { 340 return q->headdrop; 341 } 342 343 static int 344 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) 345 { 346 struct sfq_sched_data *q = qdisc_priv(sch); 347 unsigned int hash, dropped; 348 sfq_index x, qlen; 349 struct sfq_slot *slot; 350 int ret; 351 struct sk_buff *head; 352 int delta; 353 354 hash = sfq_classify(skb, sch, &ret); 355 if (hash == 0) { 356 if (ret & __NET_XMIT_BYPASS) 357 qdisc_qstats_drop(sch); 358 __qdisc_drop(skb, to_free); 359 return ret; 360 } 361 hash--; 362 363 x = q->ht[hash]; 364 slot = &q->slots[x]; 365 if (x == SFQ_EMPTY_SLOT) { 366 x = q->dep[0].next; /* get a free slot */ 367 if (x >= SFQ_MAX_FLOWS) 368 return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS); 369 WRITE_ONCE(q->ht[hash], x); 370 slot = &q->slots[x]; 371 slot->hash = hash; 372 WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */ 373 red_set_vars(&slot->vars); 374 goto enqueue; 375 } 376 if (q->red_parms) { 377 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, 378 &slot->vars, 379 slot->backlog); 380 switch (red_action(q->red_parms, 381 &slot->vars, 382 slot->vars.qavg)) { 383 case RED_DONT_MARK: 384 break; 385 386 case RED_PROB_MARK: 387 qdisc_qstats_overlimit(sch); 388 if (sfq_prob_mark(q)) { 389 /* We know we have at least one packet in queue */ 390 if (sfq_headdrop(q) && 391 INET_ECN_set_ce(slot->skblist_next)) { 392 q->stats.prob_mark_head++; 393 break; 394 } 395 if (INET_ECN_set_ce(skb)) { 396 q->stats.prob_mark++; 397 break; 398 } 399 } 400 q->stats.prob_drop++; 401 goto congestion_drop; 402 403 case RED_HARD_MARK: 404 qdisc_qstats_overlimit(sch); 405 if (sfq_hard_mark(q)) { 406 /* We know we have at least one packet in queue */ 407 if (sfq_headdrop(q) && 408 INET_ECN_set_ce(slot->skblist_next)) { 409 q->stats.forced_mark_head++; 410 break; 411 } 412 if (INET_ECN_set_ce(skb)) { 413 q->stats.forced_mark++; 414 break; 415 } 416 } 417 q->stats.forced_drop++; 418 goto congestion_drop; 419 } 420 } 421 422 if (slot->qlen >= q->maxdepth) { 423 congestion_drop: 424 if (!sfq_headdrop(q)) 425 return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_FLOW_LIMIT); 426 427 /* We know we have at least one packet in queue */ 428 head = slot_dequeue_head(slot); 429 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); 430 qstats_backlog_sub(sch, delta); 431 WRITE_ONCE(slot->backlog, slot->backlog - delta); 432 qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT); 433 434 slot_queue_add(slot, skb); 435 qdisc_tree_reduce_backlog(sch, 0, delta); 436 return NET_XMIT_CN; 437 } 438 439 enqueue: 440 qdisc_qstats_backlog_inc(sch, skb); 441 WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); 442 slot_queue_add(slot, skb); 443 sfq_inc(q, x); 444 if (slot->qlen == 1) { /* The flow is new */ 445 if (q->tail == NULL) { /* It is the first flow */ 446 slot->next = x; 447 } else { 448 slot->next = q->tail->next; 449 q->tail->next = x; 450 } 451 /* We put this flow at the end of our flow list. 452 * This might sound unfair for a new flow to wait after old ones, 453 * but we could endup servicing new flows only, and freeze old ones. 454 */ 455 q->tail = slot; 456 /* We could use a bigger initial quantum for new flows */ 457 WRITE_ONCE(slot->allot, q->quantum); 458 } 459 qdisc_qlen_inc(sch); 460 if (sch->q.qlen <= q->limit) 461 return NET_XMIT_SUCCESS; 462 463 qlen = slot->qlen; 464 dropped = sfq_drop(sch, to_free); 465 /* Return Congestion Notification only if we dropped a packet 466 * from this flow. 467 */ 468 if (qlen != slot->qlen) { 469 qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); 470 return NET_XMIT_CN; 471 } 472 473 /* As we dropped a packet, better let upper stack know this */ 474 qdisc_tree_reduce_backlog(sch, 1, dropped); 475 return NET_XMIT_SUCCESS; 476 } 477 478 static struct sk_buff * 479 sfq_dequeue(struct Qdisc *sch) 480 { 481 struct sfq_sched_data *q = qdisc_priv(sch); 482 struct sk_buff *skb; 483 sfq_index a, next_a; 484 struct sfq_slot *slot; 485 486 /* No active slots */ 487 if (q->tail == NULL) 488 return NULL; 489 490 next_slot: 491 a = q->tail->next; 492 slot = &q->slots[a]; 493 if (slot->allot <= 0) { 494 q->tail = slot; 495 WRITE_ONCE(slot->allot, slot->allot + q->quantum); 496 goto next_slot; 497 } 498 skb = slot_dequeue_head(slot); 499 sfq_dec(q, a); 500 qdisc_bstats_update(sch, skb); 501 qdisc_qlen_dec(sch); 502 qdisc_qstats_backlog_dec(sch, skb); 503 WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb)); 504 /* Is the slot empty? */ 505 if (slot->qlen == 0) { 506 WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); 507 next_a = slot->next; 508 if (a == next_a) { 509 q->tail = NULL; /* no more active slots */ 510 return skb; 511 } 512 q->tail->next = next_a; 513 } else { 514 WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb)); 515 } 516 return skb; 517 } 518 519 static void 520 sfq_reset(struct Qdisc *sch) 521 { 522 struct sk_buff *skb; 523 524 while ((skb = sfq_dequeue(sch)) != NULL) 525 rtnl_kfree_skbs(skb, skb); 526 } 527 528 /* 529 * When q->perturbation is changed, we rehash all queued skbs 530 * to avoid OOO (Out Of Order) effects. 531 * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change 532 * counters. 533 */ 534 static void sfq_rehash(struct Qdisc *sch) 535 { 536 struct sfq_sched_data *q = qdisc_priv(sch); 537 struct sk_buff *skb; 538 int i; 539 struct sfq_slot *slot; 540 struct sk_buff_head list; 541 int dropped = 0; 542 unsigned int drop_len = 0; 543 544 __skb_queue_head_init(&list); 545 546 for (i = 0; i < q->maxflows; i++) { 547 slot = &q->slots[i]; 548 if (!slot->qlen) 549 continue; 550 while (slot->qlen) { 551 skb = slot_dequeue_head(slot); 552 sfq_dec(q, i); 553 __skb_queue_tail(&list, skb); 554 } 555 WRITE_ONCE(slot->backlog, 0); 556 red_set_vars(&slot->vars); 557 WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); 558 } 559 q->tail = NULL; 560 561 while ((skb = __skb_dequeue(&list)) != NULL) { 562 unsigned int hash = sfq_hash(q, skb); 563 sfq_index x = q->ht[hash]; 564 565 slot = &q->slots[x]; 566 if (x == SFQ_EMPTY_SLOT) { 567 x = q->dep[0].next; /* get a free slot */ 568 if (x >= SFQ_MAX_FLOWS) { 569 drop: 570 qdisc_qstats_backlog_dec(sch, skb); 571 drop_len += qdisc_pkt_len(skb); 572 kfree_skb(skb); 573 dropped++; 574 continue; 575 } 576 WRITE_ONCE(q->ht[hash], x); 577 slot = &q->slots[x]; 578 slot->hash = hash; 579 } 580 if (slot->qlen >= q->maxdepth) 581 goto drop; 582 slot_queue_add(slot, skb); 583 if (q->red_parms) 584 slot->vars.qavg = red_calc_qavg(q->red_parms, 585 &slot->vars, 586 slot->backlog); 587 WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); 588 sfq_inc(q, x); 589 if (slot->qlen == 1) { /* The flow is new */ 590 if (q->tail == NULL) { /* It is the first flow */ 591 slot->next = x; 592 } else { 593 slot->next = q->tail->next; 594 q->tail->next = x; 595 } 596 q->tail = slot; 597 WRITE_ONCE(slot->allot, q->quantum); 598 } 599 } 600 WRITE_ONCE(sch->q.qlen, sch->q.qlen - dropped); 601 qdisc_tree_reduce_backlog(sch, dropped, drop_len); 602 } 603 604 static void sfq_perturbation(struct timer_list *t) 605 { 606 struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer); 607 struct Qdisc *sch = q->sch; 608 spinlock_t *root_lock; 609 siphash_key_t nkey; 610 int period; 611 612 get_random_bytes(&nkey, sizeof(nkey)); 613 rcu_read_lock(); 614 root_lock = qdisc_lock(qdisc_root_sleeping(sch)); 615 spin_lock(root_lock); 616 q->perturbation = nkey; 617 if (!q->filter_list && q->tail) 618 sfq_rehash(sch); 619 spin_unlock(root_lock); 620 621 /* q->perturb_period can change under us from 622 * sfq_change() and sfq_destroy(). 623 */ 624 period = READ_ONCE(q->perturb_period); 625 if (period) 626 mod_timer(&q->perturb_timer, jiffies + period); 627 rcu_read_unlock(); 628 } 629 630 static int sfq_change(struct Qdisc *sch, struct nlattr *opt, 631 struct netlink_ext_ack *extack) 632 { 633 struct sfq_sched_data *q = qdisc_priv(sch); 634 struct tc_sfq_qopt *ctl = nla_data(opt); 635 struct tc_sfq_qopt_v1 *ctl_v1 = NULL; 636 unsigned int qlen, dropped = 0; 637 struct red_parms *p = NULL; 638 struct sk_buff *to_free = NULL; 639 struct sk_buff *tail = NULL; 640 unsigned int maxflows; 641 unsigned int quantum; 642 unsigned int divisor; 643 int perturb_period; 644 u8 headdrop; 645 u8 maxdepth; 646 int limit; 647 u8 flags; 648 649 650 if (opt->nla_len < nla_attr_size(sizeof(*ctl))) 651 return -EINVAL; 652 if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) 653 ctl_v1 = nla_data(opt); 654 if (ctl->divisor && 655 (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) 656 return -EINVAL; 657 658 if ((int)ctl->quantum < 0) { 659 NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); 660 return -EINVAL; 661 } 662 663 if (ctl->perturb_period < 0 || 664 ctl->perturb_period > INT_MAX / HZ) { 665 NL_SET_ERR_MSG_MOD(extack, "invalid perturb period"); 666 return -EINVAL; 667 } 668 perturb_period = ctl->perturb_period * HZ; 669 670 if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, 671 ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) 672 return -EINVAL; 673 if (ctl_v1 && ctl_v1->qth_min) { 674 p = kmalloc_obj(*p); 675 if (!p) 676 return -ENOMEM; 677 } 678 679 sch_tree_lock(sch); 680 681 limit = q->limit; 682 divisor = q->divisor; 683 headdrop = q->headdrop; 684 maxdepth = q->maxdepth; 685 maxflows = q->maxflows; 686 quantum = q->quantum; 687 flags = q->flags; 688 689 /* update and validate configuration */ 690 if (ctl->quantum) 691 quantum = ctl->quantum; 692 if (ctl->flows) 693 maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); 694 if (ctl->divisor) { 695 divisor = ctl->divisor; 696 maxflows = min_t(u32, maxflows, divisor); 697 } 698 if (ctl_v1) { 699 if (ctl_v1->depth) 700 maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); 701 if (p) { 702 red_set_parms(p, 703 ctl_v1->qth_min, ctl_v1->qth_max, 704 ctl_v1->Wlog, 705 ctl_v1->Plog, ctl_v1->Scell_log, 706 NULL, 707 ctl_v1->max_P); 708 } 709 flags = ctl_v1->flags; 710 headdrop = ctl_v1->headdrop; 711 } 712 if (ctl->limit) { 713 limit = min_t(u32, ctl->limit, maxdepth * maxflows); 714 maxflows = min_t(u32, maxflows, limit); 715 } 716 if (limit == 1) { 717 sch_tree_unlock(sch); 718 kfree(p); 719 NL_SET_ERR_MSG_MOD(extack, "invalid limit"); 720 return -EINVAL; 721 } 722 723 /* commit configuration */ 724 q->limit = limit; 725 q->divisor = divisor; 726 q->headdrop = headdrop; 727 q->maxdepth = maxdepth; 728 q->maxflows = maxflows; 729 WRITE_ONCE(q->perturb_period, perturb_period); 730 q->quantum = quantum; 731 q->flags = flags; 732 if (p) 733 swap(q->red_parms, p); 734 735 qlen = sch->q.qlen; 736 while (sch->q.qlen > q->limit) { 737 dropped += sfq_drop(sch, &to_free); 738 if (!tail) 739 tail = to_free; 740 } 741 742 rtnl_kfree_skbs(to_free, tail); 743 qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); 744 745 timer_delete(&q->perturb_timer); 746 if (q->perturb_period) { 747 mod_timer(&q->perturb_timer, jiffies + q->perturb_period); 748 get_random_bytes(&q->perturbation, sizeof(q->perturbation)); 749 } 750 sch_tree_unlock(sch); 751 kfree(p); 752 return 0; 753 } 754 755 static void *sfq_alloc(size_t sz) 756 { 757 return kvmalloc(sz, GFP_KERNEL); 758 } 759 760 static void sfq_free(void *addr) 761 { 762 kvfree(addr); 763 } 764 765 static void sfq_destroy(struct Qdisc *sch) 766 { 767 struct sfq_sched_data *q = qdisc_priv(sch); 768 769 tcf_block_put(q->block); 770 WRITE_ONCE(q->perturb_period, 0); 771 timer_delete_sync(&q->perturb_timer); 772 sfq_free(q->ht); 773 sfq_free(q->slots); 774 kfree(q->red_parms); 775 } 776 777 static int sfq_init(struct Qdisc *sch, struct nlattr *opt, 778 struct netlink_ext_ack *extack) 779 { 780 struct sfq_sched_data *q = qdisc_priv(sch); 781 int i; 782 int err; 783 784 q->sch = sch; 785 timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); 786 787 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 788 if (err) 789 return err; 790 791 for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { 792 q->dep[i].next = i + SFQ_MAX_FLOWS; 793 q->dep[i].prev = i + SFQ_MAX_FLOWS; 794 } 795 796 q->limit = SFQ_MAX_DEPTH; 797 q->maxdepth = SFQ_MAX_DEPTH; 798 q->cur_depth = 0; 799 q->tail = NULL; 800 q->divisor = SFQ_DEFAULT_HASH_DIVISOR; 801 q->maxflows = SFQ_DEFAULT_FLOWS; 802 q->quantum = psched_mtu(qdisc_dev(sch)); 803 q->perturb_period = 0; 804 get_random_bytes(&q->perturbation, sizeof(q->perturbation)); 805 806 if (opt) { 807 int err = sfq_change(sch, opt, extack); 808 if (err) 809 return err; 810 } 811 812 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); 813 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); 814 if (!q->ht || !q->slots) { 815 /* Note: sfq_destroy() will be called by our caller */ 816 return -ENOMEM; 817 } 818 819 for (i = 0; i < q->divisor; i++) 820 q->ht[i] = SFQ_EMPTY_SLOT; 821 822 for (i = 0; i < q->maxflows; i++) { 823 slot_queue_init(&q->slots[i]); 824 sfq_link(q, i); 825 } 826 if (q->limit >= 1) 827 sch->flags |= TCQ_F_CAN_BYPASS; 828 else 829 sch->flags &= ~TCQ_F_CAN_BYPASS; 830 return 0; 831 } 832 833 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) 834 { 835 struct sfq_sched_data *q = qdisc_priv(sch); 836 unsigned char *b = skb_tail_pointer(skb); 837 struct tc_sfq_qopt_v1 opt; 838 struct red_parms *p = q->red_parms; 839 840 memset(&opt, 0, sizeof(opt)); 841 opt.v0.quantum = q->quantum; 842 opt.v0.perturb_period = q->perturb_period / HZ; 843 opt.v0.limit = q->limit; 844 opt.v0.divisor = q->divisor; 845 opt.v0.flows = q->maxflows; 846 opt.depth = q->maxdepth; 847 opt.headdrop = q->headdrop; 848 849 if (p) { 850 opt.qth_min = p->qth_min >> p->Wlog; 851 opt.qth_max = p->qth_max >> p->Wlog; 852 opt.Wlog = p->Wlog; 853 opt.Plog = p->Plog; 854 opt.Scell_log = p->Scell_log; 855 opt.max_P = p->max_P; 856 } 857 memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); 858 opt.flags = q->flags; 859 860 if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) 861 goto nla_put_failure; 862 863 return skb->len; 864 865 nla_put_failure: 866 nlmsg_trim(skb, b); 867 return -1; 868 } 869 870 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) 871 { 872 return NULL; 873 } 874 875 static unsigned long sfq_find(struct Qdisc *sch, u32 classid) 876 { 877 return 0; 878 } 879 880 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, 881 u32 classid) 882 { 883 return 0; 884 } 885 886 static void sfq_unbind(struct Qdisc *q, unsigned long cl) 887 { 888 } 889 890 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl, 891 struct netlink_ext_ack *extack) 892 { 893 struct sfq_sched_data *q = qdisc_priv(sch); 894 895 if (cl) 896 return NULL; 897 return q->block; 898 } 899 900 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, 901 struct sk_buff *skb, struct tcmsg *tcm) 902 { 903 tcm->tcm_handle |= TC_H_MIN(cl); 904 return 0; 905 } 906 907 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, 908 struct gnet_dump *d) 909 { 910 struct sfq_sched_data *q = qdisc_priv(sch); 911 sfq_index idx = READ_ONCE(q->ht[cl - 1]); 912 struct gnet_stats_queue qs = { 0 }; 913 struct tc_sfq_xstats xstats = { 0 }; 914 915 if (idx != SFQ_EMPTY_SLOT) { 916 const struct sfq_slot *slot = &q->slots[idx]; 917 918 xstats.allot = READ_ONCE(slot->allot); 919 qs.qlen = READ_ONCE(slot->qlen); 920 qs.backlog = READ_ONCE(slot->backlog); 921 } 922 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 923 return -1; 924 return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); 925 } 926 927 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) 928 { 929 struct sfq_sched_data *q = qdisc_priv(sch); 930 unsigned int i; 931 932 if (arg->stop) 933 return; 934 935 for (i = 0; i < q->divisor; i++) { 936 if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) { 937 arg->count++; 938 continue; 939 } 940 if (!tc_qdisc_stats_dump(sch, i + 1, arg)) 941 break; 942 } 943 } 944 945 static const struct Qdisc_class_ops sfq_class_ops = { 946 .leaf = sfq_leaf, 947 .find = sfq_find, 948 .tcf_block = sfq_tcf_block, 949 .bind_tcf = sfq_bind, 950 .unbind_tcf = sfq_unbind, 951 .dump = sfq_dump_class, 952 .dump_stats = sfq_dump_class_stats, 953 .walk = sfq_walk, 954 }; 955 956 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { 957 .cl_ops = &sfq_class_ops, 958 .id = "sfq", 959 .priv_size = sizeof(struct sfq_sched_data), 960 .enqueue = sfq_enqueue, 961 .dequeue = sfq_dequeue, 962 .peek = qdisc_peek_dequeued, 963 .init = sfq_init, 964 .reset = sfq_reset, 965 .destroy = sfq_destroy, 966 .change = NULL, 967 .dump = sfq_dump, 968 .owner = THIS_MODULE, 969 }; 970 MODULE_ALIAS_NET_SCH("sfq"); 971 972 static int __init sfq_module_init(void) 973 { 974 return register_qdisc(&sfq_qdisc_ops); 975 } 976 static void __exit sfq_module_exit(void) 977 { 978 unregister_qdisc(&sfq_qdisc_ops); 979 } 980 module_init(sfq_module_init) 981 module_exit(sfq_module_exit) 982 MODULE_LICENSE("GPL"); 983 MODULE_DESCRIPTION("Stochastic Fairness qdisc"); 984