// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Internetworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which puts interactive
	traffic into a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets;
	- max mtu to 2^18-1;
	- max number of flows to 65408;
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */

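/* Illustrative example (not part of the kernel build): from user space the
 * qdisc is typically attached with something like
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * where 'perturb 10' re-keys the flow hash every 10 seconds; 'quantum',
 * 'limit', 'divisor', 'flows', 'depth' and 'headdrop' map onto the
 * parameters handled by sfq_change() below.  The device name and values
 * here are only an example.
 */
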
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[] array
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to the dep[] array.
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen;		/* number of skbs in skblist */
	sfq_index	next;		/* next slot in sfq RR chain */
	struct sfq_head	dep;		/* anchor in dep[] chains */
	unsigned short	hash;		/* hash value (index in ht[]) */
	int		allot;		/* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t	perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
	struct Qdisc	*sch;
};

/*
 * sfq_head structs are either embedded in a sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
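
/* Worked example: with SFQ_MAX_FLOWS == 65408, a value of 65411 is not a
 * flow index but refers to q->dep[3], the chain of flows currently holding
 * exactly three packets (see the dep[] comment in struct sfq_sched_data).
 */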

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}

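/* Map a packet to a bucket number in [1 .. divisor]; 0 means "drop".
 * An explicit class carried in skb->priority or returned by a tc filter
 * takes precedence, otherwise the perturbed flow hash is used.
 */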
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
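
/* sfq_inc()/sfq_dec() move slot x between the dep[] chains as its queue
 * length changes and keep q->cur_depth equal to the largest per-flow
 * backlog, so sfq_drop() can locate the longest flow in O(1).
 */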

/* helper functions : might be changed when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
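
/* The (struct sk_buff *) casts above rely on skblist_next/skblist_prev
 * being the first two members of struct sfq_slot, mirroring skb->next and
 * skb->prev, so an empty slot can serve as its own circular list head.
 */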

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch, to_free);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		if (slot->next == x)
			q->tail = NULL; /* no more active slots */
		else
			q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

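/* Enqueue path: classify the packet to a slot, run the optional RED/ECN
 * logic against that slot's average backlog, enforce the per-flow depth
 * limit (tail drop or head drop), then queue the packet; a flow that just
 * became active is appended to the round-robin ring with a fresh quantum.
 * Finally the global q->limit is enforced by dropping from the longest flow.
 */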
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							     &slot->vars,
							     slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * It might sound unfair that a new flow has to wait behind old
		 * ones, but otherwise we could end up servicing new flows only
		 * and freezing the old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

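/* Dequeue path: deficit round robin over the active slots.  The slot after
 * q->tail is examined; if its allot is exhausted it is replenished by one
 * quantum and skipped, otherwise one skb is taken from its head and its
 * allot is charged with the packet length.  Slots that become empty are
 * unlinked from the ring.
 */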
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= qdisc_pkt_len(skb);
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

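/* Timer callback: generate a fresh siphash key so colliding flows get
 * spread over different buckets, and (when classification is purely hash
 * based) rehash the queued packets under the qdisc root lock to avoid
 * reordering within a flow.
 */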
static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;
	siphash_key_t nkey;
	int period;

	get_random_bytes(&nkey, sizeof(nkey));
	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	/* q->perturb_period can change under us from
	 * sfq_change() and sfq_destroy().
	 */
	period = READ_ONCE(q->perturb_period);
	if (period)
		mod_timer(&q->perturb_timer, jiffies + period);
	rcu_read_unlock();
}

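/* Parse a TCA_OPTIONS payload (struct tc_sfq_qopt, or the larger
 * tc_sfq_qopt_v1 carrying RED parameters), validate it, and commit the new
 * settings under the qdisc tree lock, dropping packets if the new limit
 * requires shrinking the queue.
 */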
static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;
	unsigned int maxflows;
	unsigned int quantum;
	unsigned int divisor;
	int perturb_period;
	u8 headdrop;
	u8 maxdepth;
	int limit;
	u8 flags;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	if ((int)ctl->quantum < 0) {
		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
		return -EINVAL;
	}

	if (ctl->perturb_period < 0 ||
	    ctl->perturb_period > INT_MAX / HZ) {
		NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
		return -EINVAL;
	}
	perturb_period = ctl->perturb_period * HZ;

	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}

	sch_tree_lock(sch);

	limit = q->limit;
	divisor = q->divisor;
	headdrop = q->headdrop;
	maxdepth = q->maxdepth;
	maxflows = q->maxflows;
	quantum = q->quantum;
	flags = q->flags;

	/* update and validate configuration */
	if (ctl->quantum)
		quantum = ctl->quantum;
	if (ctl->flows)
		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		divisor = ctl->divisor;
		maxflows = min_t(u32, maxflows, divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			red_set_parms(p,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		flags = ctl_v1->flags;
		headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
		maxflows = min_t(u32, maxflows, limit);
	}
	if (limit == 1) {
		sch_tree_unlock(sch);
		kfree(p);
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}

	/* commit configuration */
	q->limit = limit;
	q->divisor = divisor;
	q->headdrop = headdrop;
	q->maxdepth = maxdepth;
	q->maxflows = maxflows;
	WRITE_ONCE(q->perturb_period, perturb_period);
	q->quantum = quantum;
	q->flags = flags;
	if (p)
		swap(q->red_parms, p);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	timer_delete(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	WRITE_ONCE(q->perturb_period, 0);
	timer_delete_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	q->sch = sch;
	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturb_period = 0;
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));

	if (opt) {
		int err = sfq_change(sch, opt, extack);

		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT) {
			arg->count++;
			continue;
		}
		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
			break;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("sfq");

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Stochastic Fairness qdisc");