/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/reciprocal_div.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is one additional
   step during enqueue: if the average queue size exceeds the minimum
   threshold (qth_min), a packet is drawn at random from the queue, and if
   the new and the drawn packet belong to the same flow, both are dropped.
   Unlike RED, CHOKe is not really a "classful" qdisc, because it needs
   random access to the packets in its queue. It provides a minimal class
   interface only so that the builtin flow classifier can be overridden
   with filters.

   Sources:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth
   Allocation", IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */
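
/*
 * Example setup (a sketch, assuming the iproute2 "tc" front end; the
 * parameter names mirror those of RED and may vary between iproute2
 * versions):
 *
 *	tc qdisc add dev eth0 parent 1:1 handle 10: choke limit 1000 \
 *		bandwidth 10mbit avpkt 1000 min 83 max 250 burst 138 ecn
 *
 * As in RED, min/max (qth_min/qth_max below) bound the region in which
 * probabilistic early marking or dropping takes place.
 */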

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* deliver a random number between 0 and N - 1 */
static u32 random_N(unsigned int N)
{
	return reciprocal_divide(random32(), N);
}
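
/*
 * Note: at this revision reciprocal_divide(A, R) is defined as
 * (u32)(((u64)A * R) >> 32), so the call above computes
 * (random32() * N) >> 32: a multiply-and-shift that maps a uniform
 * 32-bit value into [0, N) without a hardware division.  For N not a
 * power of two the result is very slightly biased, which is harmless
 * for picking a victim slot.
 */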

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
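
/*
 * Worked example: with tab_mask = 7 (an 8-slot table), head = 6 and
 * tail = 2, the occupied slots are 6, 7, 0, 1 and
 * (2 - 6) & 7 = 0xfffffffc & 7 = 4, as expected.  The mask trick works
 * because the table size is always a power of two (see choke_change()).
 */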

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
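
/*
 * Together these helpers maintain the ring-buffer invariant: whenever the
 * queue is non-empty, q->tab[q->head] is a valid skb and q->tail points
 * one past the last valid entry, so dequeue never has to scan for holes.
 */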

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_drop(skb, sch);
	qdisc_tree_decrease_qlen(sch, 1);
	--sch->q.qlen;
}
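
/*
 * qdisc_tree_decrease_qlen() above tells ancestor qdiscs that a packet
 * vanished without being dequeued, keeping their qlen accounting
 * consistent when choke is used as an inner qdisc.
 */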

struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct flow_keys	keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}
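
/*
 * The per-packet state lives in skb->cb[], after the generic
 * struct qdisc_skb_cb; the BUILD_BUG_ON above turns any future growth of
 * choke_skb_cb beyond the 48-byte cb[] area into a compile-time error.
 */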

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination address and port match;
 *  false for differing protocols.
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(struct flow_keys));
}
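
/*
 * Flow dissection is lazy and cached: keys_valid is cleared once at
 * enqueue time, so each skb is dissected at most once no matter how many
 * comparisons it takes part in while sitting in the queue.
 */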

/*
 * Classify flow using one of:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then give up and return the head.
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}
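
/*
 * Note: when every retry lands on a hole, selection falls back to the
 * head slot, which biases the choice slightly toward the oldest packet.
 * The head slot is never a hole while the queue is non-empty (see
 * choke_zap_head_holes()), so the fallback is always valid then.
 */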

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (q->filter_list)
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

	if (q->filter_list) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);
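
	/*
	 * qavg is the RED exponentially weighted moving average of the
	 * queue length, kept in fixed point shifted left by Wlog bits
	 * (which is why choke_dump() shifts qth_min/qth_max back down);
	 * see net/red.h.  Idle periods are accounted for separately so
	 * the average decays while the link is quiet.
	 */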

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			sch->qstats.overlimits++;
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				sch->qstats.overlimits++;
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		sch->qstats.backlog += qdisc_pkt_len(skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;

 congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

 other_drop:
	if (ret & __NET_XMIT_BYPASS)
		sch->qstats.drops++;
	kfree_skb(skb);
	return ret;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
	}

	return len;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	/* Purge packets still held in the table, then restart RED state;
	 * restarting alone would leak any queued skbs.
	 */
	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (skb)
			qdisc_drop(skb, sch);
	}
	sch->q.qlen = 0;
	sch->qstats.backlog = 0;
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
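	/*
	 * Worked example: limit = 1000 gives roundup_pow_of_two(1001) = 1024,
	 * so mask = 1023.  Sizing for limit + 1 slots guarantees that a full
	 * queue still leaves head != tail, preserving the ring-buffer
	 * empty test.
	 */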
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;
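
		/*
		 * kcalloc() is tried first for physically contiguous memory;
		 * large tables can fail that allocation, so fall back to
		 * virtually contiguous vzalloc() and let choke_free() pick
		 * the matching release path via is_vmalloc_addr().
		 */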

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
	NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P);
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
				u32 classid)
{
	return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	if (!arg->stop) {
		if (arg->fn(sch, 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops choke_class_ops = {
	.leaf		=	choke_leaf,
	.get		=	choke_get,
	.put		=	choke_put,
	.tcf_chain	=	choke_find_tcf,
	.bind_tcf	=	choke_bind,
	.unbind_tcf	=	choke_put,
	.dump		=	choke_dump_class,
	.walk		=	choke_walk,
};
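
/*
 * Note: choke_qdisc_ops below does not set .cl_ops, so choke_class_ops
 * and the filter hook it exposes (choke_find_tcf()) are not reachable
 * through the normal tc class/filter interface at this revision, and
 * q->filter_list stays NULL.
 */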

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");