// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If the average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and the chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in the queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth
   Allocation", IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */
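
/*
 * Example: CHOKe is configured from userspace through tc with RED-style
 * parameters. A minimal sketch, assuming the parameter syntax documented
 * in tc-choke(8) and a 10 Mbit link:
 *
 *	tc qdisc add dev eth0 root choke limit 1000 bandwidth 10mbit \
 *		min 83 max 250 avpkt 1000
 *
 * min/max map to qth_min/qth_max in struct red_parms below, and limit
 * bounds the hard queue length checked in choke_enqueue().
 */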

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;		/* Drops due to queue limits */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
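
/*
 * Worked example: with tab_mask = 7 (an 8-slot table), head = 6 and
 * tail = 2, the unsigned arithmetic (2 - 6) & 7 = 4, i.e. the occupied
 * region covers slots 6, 7, 0 and 1, holes included.
 */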

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
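
/*
 * Worked example: with head at slot 5, if the head packet is removed
 * while slots 6 and 7 are already holes, choke_zap_head_holes() walks
 * head straight to slot 0, preserving the invariant that
 * q->tab[q->head] is non-NULL whenever the queue is non-empty.
 */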

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
			      struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch, to_free);
}
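
/*
 * A victim in the middle of the ring cannot be unlinked cheaply, hence
 * the NULL "hole" left above. Accounting (qlen, backlog) is updated
 * eagerly; the slot itself is reclaimed lazily when head or tail sweeps
 * past it via the zap helpers.
 */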

struct choke_skb_cb {
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination address and port match.
 *          false for special cases
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}
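
/*
 * struct flow_keys_digest is a compact, memcmp()-able summary of the
 * dissected flow (addresses, ports, protocol). Caching it in skb->cb
 * behind keys_valid means each packet is dissected at most once, no
 * matter how many times it is drawn as a comparison candidate while
 * queued.
 */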

/*
 * Select a packet at random from the queue
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then just give up and return
 *   the head.
 * Will return NULL if the queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + get_random_u32_below(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}
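
/*
 * Returning the head after three misses is safe because the zap helpers
 * keep q->tab[q->head] non-NULL whenever head != tail. It slightly
 * biases the sample toward the oldest packet, the trade-off the HACK
 * comment above accepts for bounded work per enqueue.
 */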

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			WRITE_ONCE(q->stats.matched, q->stats.matched + 1);
			choke_drop_by_idx(sch, idx, to_free);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				WRITE_ONCE(q->stats.forced_drop,
					   q->stats.forced_drop + 1);
				goto congestion_drop;
			}

			WRITE_ONCE(q->stats.forced_mark,
				   q->stats.forced_mark + 1);
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					WRITE_ONCE(q->stats.prob_drop,
						   q->stats.prob_drop + 1);
					goto congestion_drop;
				}

				WRITE_ONCE(q->stats.prob_mark,
					   q->stats.prob_mark + 1);
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	WRITE_ONCE(q->stats.pdrop, q->stats.pdrop + 1);
	return qdisc_drop(skb, sch, to_free);

congestion_drop:
	qdisc_drop(skb, sch, to_free);
	return NET_XMIT_CN;
}
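
/*
 * Summary of the enqueue decision tree above:
 *
 *	qavg <= qth_min			admit, reset RED count
 *	qavg > qth_min, flow match	drop both new and queued packet
 *	qavg > qth_max			forced ECN mark or drop
 *	qth_min < qavg <= qth_max	probabilistic mark/drop per RED
 *	queue at limit			tail drop, counted in pdrop
 *
 * The flow-match test runs before the RED mark/drop logic, so a
 * misbehaving flow pays the CHOKe penalty first.
 */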

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}
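
/*
 * The idle-period bookkeeping lets red_calc_qavg() decay the average
 * while the queue sits empty, so a long quiet spell does not leave a
 * stale, inflated qavg to penalize the next burst.
 */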

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		rtnl_qdisc_drop(skb, sch);
	}

	if (q->tab)
		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;
	u8 *stab;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
					  choke_policy, NULL);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = nla_get_u32_default(tb[TCA_CHOKE_MAX_P], 0);

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
	stab = nla_data(tb[TCA_CHOKE_STAB]);
	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
		return -EINVAL;

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
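	/*
	 * Example: limit = 1000 gives roundup_pow_of_two(1001) - 1 = 1023,
	 * i.e. a 1024-slot table; sizing past the limit keeps head != tail
	 * even when the queue is completely full.
	 */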
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kvzalloc_objs(struct sk_buff *, mask + 1);
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned int dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	WRITE_ONCE(q->flags, ctl->flags);
	WRITE_ONCE(q->limit, ctl->limit);

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      stab,
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}
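
/*
 * On a resize, choke_change() repacks packets densely into the new table
 * and drops any overflow under the qdisc tree lock, propagating the
 * qlen/backlog change to parents via qdisc_tree_reduce_backlog(); the
 * old table is freed only after the lock is released.
 */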

static int choke_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	return choke_change(sch, opt, extack);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	u8 Wlog = READ_ONCE(q->parms.Wlog);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= READ_ONCE(q->limit),
		.flags		= READ_ONCE(q->flags),
		.qth_min	= READ_ONCE(q->parms.qth_min) >> Wlog,
		.qth_max	= READ_ONCE(q->parms.qth_max) >> Wlog,
		.Wlog		= Wlog,
		.Plog		= READ_ONCE(q->parms.Plog),
		.Scell_log	= READ_ONCE(q->parms.Scell_log),
	};

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P)))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= READ_ONCE(q->stats.prob_drop) +
			  READ_ONCE(q->stats.forced_drop),
		.marked	= READ_ONCE(q->stats.prob_mark) +
			  READ_ONCE(q->stats.forced_mark),
		.pdrop	= READ_ONCE(q->stats.pdrop),
		.matched = READ_ONCE(q->stats.matched),
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	choke_free(q->tab);
}

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("choke");

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Choose and keep responsive flows scheduler");