/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */
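
/*
 * Illustrative sketch (not part of this module, never compiled): a minimal
 * userspace model of the CHOKe decision step described above, using a toy
 * array of flow identifiers instead of sk_buffs and the instantaneous queue
 * length in place of the RED average.  When occupancy exceeds the minimum
 * threshold, one queued entry is sampled at random; if it belongs to the
 * same flow as the arriving packet, both are dropped.
 */
#if 0
#include <stdbool.h>
#include <stdlib.h>

#define TOY_QUEUE_MAX	64

struct toy_queue {
	unsigned int flow[TOY_QUEUE_MAX];	/* flow id per queued packet */
	unsigned int len;
};

/* Returns true if the arriving packet is admitted, false if the CHOKe step
 * dropped it together with the matching victim drawn from the queue.
 */
static bool toy_choke_admit(struct toy_queue *q, unsigned int new_flow,
			    unsigned int qth_min)
{
	if (q->len > qth_min) {
		unsigned int victim = (unsigned int)rand() % q->len;

		if (q->flow[victim] == new_flow) {
			/* drop the sampled packet (swap with the last one)
			 * and the arriving packet
			 */
			q->flow[victim] = q->flow[--q->len];
			return false;
		}
	}
	if (q->len >= TOY_QUEUE_MAX)
		return false;		/* tail drop once full */
	q->flow[q->len++] = new_flow;
	return true;
}
#endif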

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto __rcu *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
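
/*
 * Worked example for the index arithmetic above (illustrative numbers): with
 * a table of 8 slots (tab_mask = 7), head = 6 and tail = 2 after wrap-around,
 * the occupied region is slots 6, 7, 0, 1 and
 * (tail - head) & tab_mask = (2 - 6) & 7 = 4, counting any holes in between.
 * This only works because the table size is a power of two.
 */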

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
			      struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch, to_free);
	--sch->q.qlen;
}

struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare the flows of two packets.
 *  Returns true only if the source and destination addresses and ports match;
 *          false for special cases.
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	fl = rcu_dereference_bh(q->filter_list);
	result = tc_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry several
 *   times to find a random skb; if that fails, just give up and return the head.
 * Returns NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retrys = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retrys > 0);

	return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (rcu_access_pointer(q->filter_list))
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	if (rcu_access_pointer(q->filter_list)) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx, to_free);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch, to_free);

congestion_drop:
	qdisc_drop(skb, sch, to_free);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		qdisc_qstats_drop(sch);
	__qdisc_drop(skb, to_free);
	return ret;
}
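
/*
 * For reference (standard RED behaviour, not specific to CHOKe): the average
 * queue size used in choke_enqueue() is the usual RED EWMA,
 *
 *	qavg <- (1 - W) * qavg + W * qlen,	where W = 2^(-Wlog),
 *
 * kept in fixed point by red_calc_qavg().  CHOKe only inserts the
 * random-match drop step ahead of the normal RED min/max threshold handling.
 */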

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		rtnl_qdisc_drop(skb, sch);
	}

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;
	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};


static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}
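
/*
 * Worked example for the table sizing above (illustrative numbers): with
 * ctl->limit = 1000, roundup_pow_of_two(1001) = 1024, so tab_mask = 1023 and
 * the table holds 1024 slots.  The "+ 1" guarantees at least one spare slot,
 * so head != tail even when the queue holds "limit" packets and a full ring
 * is never mistaken for an empty one.
 */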

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");