1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
4 *
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6 */
7
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24
25
26 /* Stochastic Fairness Queuing algorithm.
27 =======================================
28
29 Source:
30 Paul E. McKenney "Stochastic Fairness Queuing",
31 IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32
33 Paul E. McKenney "Stochastic Fairness Queuing",
34 "Interworking: Research and Experience", v.2, 1991, p.113-131.
35
36
37 See also:
38 M. Shreedhar and George Varghese "Efficient Fair
39 Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40
41
42 This is not the thing that is usually called (W)FQ nowadays.
43 It does not use any timestamp mechanism, but instead
44 processes queues in round-robin order.
45
46 ADVANTAGE:
47
48 - It is very cheap. Both CPU and memory requirements are minimal.
49
50 DRAWBACKS:
51
52 - "Stochastic" -> It is not 100% fair.
53 When hash collisions occur, several flows are considered as one.
54
55 - "Round-robin" -> It introduces larger delays than virtual clock
56 based schemes, and should not be used for isolating interactive
57 traffic from non-interactive. It means, that this scheduler
58 should be used as leaf of CBQ or P3, which put interactive traffic
59 to higher priority band.
60
61 We still need true WFQ for top level CSZ, but using WFQ
62 for the best effort traffic is absolutely pointless:
63 SFQ is superior for this purpose.
64
65 IMPLEMENTATION:
66 This implementation limits :
67 - maximal queue length per flow to 127 packets.
68 - max mtu to 2^18-1;
69 - max 65408 flows,
70 - number of hash buckets to 65536.
71
72 It is easy to increase these values, but not in flight. */
73
74 #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS 128
76 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT 0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79
80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
81 typedef u16 sfq_index;
82
83 /*
84 * We dont use pointers to save space.
85 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
86 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
87 * are 'pointers' to dep[] array
88 */
89 struct sfq_head {
90 sfq_index next;
91 sfq_index prev;
92 };
93
94 struct sfq_slot {
95 struct sk_buff *skblist_next;
96 struct sk_buff *skblist_prev;
97 sfq_index qlen; /* number of skbs in skblist */
98 sfq_index next; /* next slot in sfq RR chain */
99 struct sfq_head dep; /* anchor in dep[] chains */
100 unsigned short hash; /* hash value (index in ht[]) */
101 int allot; /* credit for this slot */
102
103 unsigned int backlog;
104 struct red_vars vars;
105 };
106
107 struct sfq_sched_data {
108 /* frequently used fields */
109 int limit; /* limit of total number of packets in this qdisc */
110 unsigned int divisor; /* number of slots in hash table */
111 u8 headdrop;
112 u8 maxdepth; /* limit of packets per flow */
113
114 siphash_key_t perturbation;
115 u8 cur_depth; /* depth of longest slot */
116 u8 flags;
117 struct tcf_proto __rcu *filter_list;
118 struct tcf_block *block;
119 sfq_index *ht; /* Hash table ('divisor' slots) */
120 struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
121
122 struct red_parms *red_parms;
123 struct tc_sfqred_stats stats;
124 struct sfq_slot *tail; /* current slot in round */
125
126 struct sfq_head dep[SFQ_MAX_DEPTH + 1];
127 /* Linked lists of slots, indexed by depth
128 * dep[0] : list of unused flows
129 * dep[1] : list of flows with 1 packet
130 * dep[X] : list of flows with X packets
131 */
132
133 unsigned int maxflows; /* number of flows in flows array */
134 int perturb_period;
135 unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
136 struct timer_list perturb_timer;
137 struct Qdisc *sch;
138 };
139
140 /*
141 * sfq_head are either in a sfq_slot or in dep[] array
142 */
/*
 * Resolve an sfq_index to the sfq_head it names: small values index a
 * flow in slots[], the remaining range maps onto the dep[] anchors.
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	return (val < SFQ_MAX_FLOWS) ? &q->slots[val].dep
				     : &q->dep[val - SFQ_MAX_FLOWS];
}
149
/* Hash a packet into a bucket index in [0, q->divisor - 1].
 * q->divisor is validated to be a power of two, so masking is an
 * exact modulo.  The siphash key (q->perturbation) is periodically
 * re-randomized to break persistent hash collisions.
 */
static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
155
/* Map a packet to a flow number in [1 .. q->divisor], or 0 to drop it.
 * Order of precedence: explicit skb->priority targeting this qdisc,
 * then attached TC filters, then the stochastic hash.  When a filter
 * action consumes the packet, *qerr is updated so the caller reports
 * the correct NET_XMIT_* result.
 */
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	/* skb->priority can select the flow directly when its major
	 * number addresses this qdisc and the minor is a valid bucket.
	 */
	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;	/* no filters: hash it */

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}
192
193 /*
194 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
195 */
/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 *
 * Insert slot x at the head of the dep[] chain matching its current
 * queue length; this keeps "find a longest/unused flow" O(1).
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	/* p is the dep[qlen] anchor expressed as an sfq_index */
	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;	/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}
211
/* Detach slot x from whatever dep[] chain it is linked on, leaving
 * n and p set to its former neighbours (used by sfq_dec/sfq_inc to
 * detect when a depth chain became empty).
 */
#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)
219
220
/* Account one packet removed from slot x: decrement its depth and
 * relink it on the dep[] chain for the new depth.
 */
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen;
	WRITE_ONCE(q->slots[x].qlen, d - 1);
	/* x was the only slot on dep[d]: the maximal depth shrinks */
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}
234
/* Account one packet added to slot x: increment its depth and relink
 * it on the dep[] chain for the new depth, tracking the maximum.
 */
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen + 1;
	WRITE_ONCE(q->slots[x].qlen, d);
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
248
249 /* helper functions : might be changed when/if skb use a standard list_head */
250
251 /* remove one skb from tail of slot queue */
/* remove one skb from tail of slot queue
 * The skb list is circular and anchored on the slot itself (the slot
 * pointer doubles as the list head, cast to an skb).  Caller must
 * guarantee the list is non-empty.
 */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}
261
262 /* remove one skb from head of slot queue */
/* remove one skb from head of slot queue
 * Caller must guarantee the list is non-empty; see slot_dequeue_tail()
 * for the circular-list convention.
 */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}
272
/* Zero the whole slot and make its skb list empty: both list links
 * point back at the slot itself.
 */
static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}
278
279 /* add skb to slot queue (tail add) */
/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
287
/* Drop one packet from the deepest flow (head or tail depending on
 * q->headdrop).  Returns the dropped packet's length, or 0 when there
 * was nothing to drop.
 */
static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		/* dep[d] heads the list of flows holding d packets */
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		WRITE_ONCE(slot->backlog, slot->backlog - len);
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		/* Victim flow will become empty: unlink it from the
		 * round-robin ring and its hash bucket before dropping.
		 */
		x = q->tail->next;
		slot = &q->slots[x];
		if (slot->next == x)
			q->tail = NULL; /* no more active slots */
		else
			q->tail->next = slot->next;
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
		goto drop;
	}

	return 0;
}
325
326 /* Is ECN parameter configured */
/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}
331
332 /* Should packets over max threshold just be marked */
/* Should packets over max threshold just be marked
 * (i.e. ECN enabled and hard-drop not requested)
 */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}
337
/* Drop from the head instead of the tail of an over-long flow? */
static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
342
/* Enqueue entry point.  Classifies the packet to a flow slot,
 * optionally applies per-flow RED marking/dropping, enqueues, and
 * drops from the longest flow if the global limit is exceeded.
 * Returns a NET_XMIT_* code.
 */
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		/* Filter dropped/stole the packet */
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS);
		WRITE_ONCE(q->ht[hash], x);
		slot = &q->slots[x];
		slot->hash = hash;
		WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		/* Per-flow RED: average is computed over this slot's
		 * backlog, not the whole qdisc.
		 */
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_FLOW_LIMIT);

		/* We know we have at least one packet in queue */
		/* Head-drop: evict the old head and queue the new skb;
		 * backlog changes only by the size difference.
		 */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		WRITE_ONCE(slot->backlog, slot->backlog - delta);
		qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could endup servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		WRITE_ONCE(slot->allot, q->quantum);
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}
476
/* Dequeue one packet, serving active flows round-robin.  A flow with
 * exhausted credit ('allot') is skipped and refilled with one quantum;
 * an emptied flow is removed from the ring and its hash bucket.
 */
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		/* Out of credit: refill and move on to the next flow */
		q->tail = slot;
		WRITE_ONCE(slot->allot, slot->allot + q->quantum);
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb));
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		/* Charge the packet against this flow's credit */
		WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb));
	}
	return skb;
}
517
/* Purge every queued packet by repeatedly dequeuing; freeing goes
 * through rtnl_kfree_skbs() since we hold the RTNL.
 */
static void
sfq_reset(struct Qdisc *sch)
{
	for (;;) {
		struct sk_buff *skb = sfq_dequeue(sch);

		if (!skb)
			break;
		rtnl_kfree_skbs(skb, skb);
	}
}
526
527 /*
528 * When q->perturbation is changed, we rehash all queued skbs
529 * to avoid OOO (Out Of Order) effects.
530 * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
531 * counters.
532 */
/* Move every queued skb onto a temporary list, then re-insert each one
 * under the new hash key.  Packets that no longer fit (no free slot, or
 * flow already at maxdepth) are dropped and accounted afterwards.
 * Runs under the qdisc root lock (see sfq_perturbation()).
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	/* Phase 1: drain all flows into 'list' and empty every slot */
	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		WRITE_ONCE(slot->backlog, 0);
		red_set_vars(&slot->vars);
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
	}
	q->tail = NULL;

	/* Phase 2: re-enqueue under the new perturbation key */
	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				/* Cannot keep this packet: drop and account */
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			WRITE_ONCE(q->ht[hash], x);
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			WRITE_ONCE(slot->allot, q->quantum);
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}
602
/* Timer callback: install a fresh siphash key and rehash queued
 * packets so long-lived hash collisions do not persist.  Takes the
 * qdisc root lock because it races with the datapath.
 */
static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;
	siphash_key_t nkey;
	int period;

	/* Generate the key outside the lock */
	get_random_bytes(&nkey, sizeof(nkey));
	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	q->perturbation = nkey;
	/* Rehash only applies when classification uses the hash
	 * (no filters attached) and there is something queued.
	 */
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	/* q->perturb_period can change under us from
	 * sfq_change() and sfq_destroy().
	 */
	period = READ_ONCE(q->perturb_period);
	if (period)
		mod_timer(&q->perturb_timer, jiffies + period);
	rcu_read_unlock();
}
628
/* Parse and apply a (possibly v1/RED-extended) SFQ configuration.
 * All validation happens before, and against local copies of, the
 * current settings; the commit to q->* is done atomically under the
 * qdisc tree lock.  Excess packets are dropped after the new limit
 * takes effect.  Returns 0 or a negative errno.
 */
static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;
	unsigned int maxflows;
	unsigned int quantum;
	unsigned int divisor;
	int perturb_period;
	u8 headdrop;
	u8 maxdepth;
	int limit;
	u8 flags;


	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	/* A longer attribute carries the v1 layout with RED parameters */
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	if ((int)ctl->quantum < 0) {
		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
		return -EINVAL;
	}

	/* Guard the HZ multiplication below against overflow */
	if (ctl->perturb_period < 0 ||
	    ctl->perturb_period > INT_MAX / HZ) {
		NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
		return -EINVAL;
	}
	perturb_period = ctl->perturb_period * HZ;

	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		/* Allocate before taking the lock: GFP_KERNEL may sleep */
		p = kmalloc_obj(*p);
		if (!p)
			return -ENOMEM;
	}

	sch_tree_lock(sch);

	/* Start from the current configuration... */
	limit = q->limit;
	divisor = q->divisor;
	headdrop = q->headdrop;
	maxdepth = q->maxdepth;
	maxflows = q->maxflows;
	quantum = q->quantum;
	flags = q->flags;

	/* update and validate configuration */
	if (ctl->quantum)
		quantum = ctl->quantum;
	if (ctl->flows)
		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		divisor = ctl->divisor;
		maxflows = min_t(u32, maxflows, divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			red_set_parms(p,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		flags = ctl_v1->flags;
		headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
		maxflows = min_t(u32, maxflows, limit);
	}
	if (limit == 1) {
		sch_tree_unlock(sch);
		kfree(p);
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}

	/* commit configuration */
	q->limit = limit;
	q->divisor = divisor;
	q->headdrop = headdrop;
	q->maxdepth = maxdepth;
	q->maxflows = maxflows;
	WRITE_ONCE(q->perturb_period, perturb_period);
	q->quantum = quantum;
	q->flags = flags;
	if (p)
		swap(q->red_parms, p);

	/* Trim the queue down to the (possibly lower) new limit */
	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	timer_delete(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);	/* frees the previous red_parms after swap, or NULL */
	return 0;
}
753
/* Table allocator: kvmalloc transparently falls back to vmalloc for
 * sizes too large for kmalloc.
 */
static void *sfq_alloc(size_t sz)
{
	void *buf = kvmalloc(sz, GFP_KERNEL);

	return buf;
}
758
/* Counterpart of sfq_alloc(); kvfree() handles both kmalloc'd and
 * vmalloc'd memory (and NULL).
 */
static void sfq_free(void *addr)
{
	kvfree(addr);
}
763
/* Tear down the qdisc: release filters, stop the perturb timer, and
 * free the tables.  The period is zeroed first so a concurrently
 * running timer callback cannot re-arm itself.
 */
static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	WRITE_ONCE(q->perturb_period, 0);
	timer_delete_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}
775
sfq_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)776 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
777 struct netlink_ext_ack *extack)
778 {
779 struct sfq_sched_data *q = qdisc_priv(sch);
780 int i;
781 int err;
782
783 q->sch = sch;
784 timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
785
786 err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
787 if (err)
788 return err;
789
790 for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
791 q->dep[i].next = i + SFQ_MAX_FLOWS;
792 q->dep[i].prev = i + SFQ_MAX_FLOWS;
793 }
794
795 q->limit = SFQ_MAX_DEPTH;
796 q->maxdepth = SFQ_MAX_DEPTH;
797 q->cur_depth = 0;
798 q->tail = NULL;
799 q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
800 q->maxflows = SFQ_DEFAULT_FLOWS;
801 q->quantum = psched_mtu(qdisc_dev(sch));
802 q->perturb_period = 0;
803 get_random_bytes(&q->perturbation, sizeof(q->perturbation));
804
805 if (opt) {
806 int err = sfq_change(sch, opt, extack);
807 if (err)
808 return err;
809 }
810
811 q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
812 q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
813 if (!q->ht || !q->slots) {
814 /* Note: sfq_destroy() will be called by our caller */
815 return -ENOMEM;
816 }
817
818 for (i = 0; i < q->divisor; i++)
819 q->ht[i] = SFQ_EMPTY_SLOT;
820
821 for (i = 0; i < q->maxflows; i++) {
822 slot_queue_init(&q->slots[i]);
823 sfq_link(q, i);
824 }
825 if (q->limit >= 1)
826 sch->flags |= TCQ_F_CAN_BYPASS;
827 else
828 sch->flags &= ~TCQ_F_CAN_BYPASS;
829 return 0;
830 }
831
/* Dump the current configuration as a tc_sfq_qopt_v1 TLV (the v1
 * struct embeds the legacy v0 layout, so old userspace still parses
 * it).  Returns the message length or -1 on buffer overrun.
 */
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		/* RED thresholds are stored scaled by Wlog; unscale for dump */
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
868
/* SFQ is classless: its pseudo-classes have no child qdisc to graft */
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}
873
/* Pseudo-classes cannot be looked up for manipulation */
static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}
878
/* Filters may bind to any classid; no per-class refcounting needed */
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}
884
/* Nothing to release: sfq_bind() took no reference */
static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}
888
sfq_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)889 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
890 struct netlink_ext_ack *extack)
891 {
892 struct sfq_sched_data *q = qdisc_priv(sch);
893
894 if (cl)
895 return NULL;
896 return q->block;
897 }
898
/* Dump a pseudo-class: only the handle (bucket number) is reported */
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
905
/* Dump per-bucket stats for pseudo-class cl (1-based bucket number).
 * Runs without the qdisc lock, hence the READ_ONCE() accesses paired
 * with WRITE_ONCE() on the datapath.
 */
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = READ_ONCE(q->ht[cl - 1]);
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = READ_ONCE(slot->allot);
		qs.qlen = READ_ONCE(slot->qlen);
		qs.backlog = READ_ONCE(slot->backlog);
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
925
/* Walk all hash buckets, reporting only non-empty ones as classes
 * (empty buckets are counted but skipped).
 */
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) {
			arg->count++;
			continue;
		}
		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
			break;
	}
}
943
/* SFQ exposes its hash buckets as read-only pseudo-classes (1..divisor)
 * for filter attachment and stats dumping; classes cannot be created,
 * changed, or deleted.
 */
static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};
954
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,	/* runtime change deliberately not wired up;
					 * sfq_change() is reached via init only
					 */
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
969 MODULE_ALIAS_NET_SCH("sfq");
970
/* Register the "sfq" qdisc with the packet scheduler core */
static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
/* Unregister the "sfq" qdisc on module unload */
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
979 module_init(sfq_module_init)
980 module_exit(sfq_module_exit)
981 MODULE_LICENSE("GPL");
982 MODULE_DESCRIPTION("Stochastic Fairness qdisc");
983