xref: /linux/net/sched/sch_choke.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/vmalloc.h>
18 #include <net/pkt_sched.h>
19 #include <net/inet_ecn.h>
20 #include <net/red.h>
21 #include <net/flow_dissector.h>
22 
23 /*
24    CHOKe stateless AQM for fair bandwidth allocation
25    =================================================
26 
27    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
28    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
29    maintains no flow state. The difference from RED is an additional step
30    during the enqueuing process. If average queue size is over the
31    low threshold (qmin), a packet is chosen at random from the queue.
32    If both the new and chosen packet are from the same flow, both
33    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
34    needs to access packets in queue randomly. It has a minimal class
35    interface to allow overriding the builtin flow classifier with
36    filters.
37 
38    Source:
39    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
40    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
41    IEEE INFOCOM, 2000.
42 
43    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
44    Characteristics", IEEE/ACM Transactions on Networking, 2004
45 
46  */
47 
48 /* Upper bound on size of sk_buff table (packets) */
49 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
50 
51 struct choke_sched_data {
52 /* Parameters */
53 	u32		 limit;
54 	unsigned char	 flags;
55 
56 	struct red_parms parms;
57 
58 /* Variables */
59 	struct red_vars  vars;
60 	struct tcf_proto __rcu *filter_list;
61 	struct {
62 		u32	prob_drop;	/* Early probability drops */
63 		u32	prob_mark;	/* Early probability marks */
64 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
65 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
66 		u32	pdrop;          /* Drops due to queue limits */
67 		u32	other;          /* Drops due to drop() calls */
68 		u32	matched;	/* Drops to flow match */
69 	} stats;
70 
71 	unsigned int	 head;
72 	unsigned int	 tail;
73 
74 	unsigned int	 tab_mask; /* size - 1 */
75 
76 	struct sk_buff **tab;
77 };
78 
79 /* number of elements in queue including holes */
80 static unsigned int choke_len(const struct choke_sched_data *q)
81 {
82 	return (q->tail - q->head) & q->tab_mask;
83 }
84 
85 /* Is ECN parameter configured */
86 static int use_ecn(const struct choke_sched_data *q)
87 {
88 	return q->flags & TC_RED_ECN;
89 }
90 
91 /* Should packets over max just be dropped (versus marked) */
92 static int use_harddrop(const struct choke_sched_data *q)
93 {
94 	return q->flags & TC_RED_HARDDROP;
95 }
96 
97 /* Move head pointer forward to skip over holes */
98 static void choke_zap_head_holes(struct choke_sched_data *q)
99 {
100 	do {
101 		q->head = (q->head + 1) & q->tab_mask;
102 		if (q->head == q->tail)
103 			break;
104 	} while (q->tab[q->head] == NULL);
105 }
106 
107 /* Move tail pointer backwards to reuse holes */
108 static void choke_zap_tail_holes(struct choke_sched_data *q)
109 {
110 	do {
111 		q->tail = (q->tail - 1) & q->tab_mask;
112 		if (q->head == q->tail)
113 			break;
114 	} while (q->tab[q->tail] == NULL);
115 }
116 
117 /* Drop packet from queue array by creating a "hole" */
118 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
119 {
120 	struct choke_sched_data *q = qdisc_priv(sch);
121 	struct sk_buff *skb = q->tab[idx];
122 
123 	q->tab[idx] = NULL;
124 
125 	if (idx == q->head)
126 		choke_zap_head_holes(q);
127 	if (idx == q->tail)
128 		choke_zap_tail_holes(q);
129 
130 	qdisc_qstats_backlog_dec(sch, skb);
131 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
132 	qdisc_drop(skb, sch);
133 	--sch->q.qlen;
134 }
135 
136 struct choke_skb_cb {
137 	u16			classid;
138 	u8			keys_valid;
139 	struct			flow_keys_digest keys;
140 };
141 
142 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
143 {
144 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
145 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
146 }
147 
148 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
149 {
150 	choke_skb_cb(skb)->classid = classid;
151 }
152 
153 static u16 choke_get_classid(const struct sk_buff *skb)
154 {
155 	return choke_skb_cb(skb)->classid;
156 }
157 
158 /*
159  * Compare flow of two packets
160  *  Returns true only if source and destination address and port match.
161  *          false for special cases
162  */
163 static bool choke_match_flow(struct sk_buff *skb1,
164 			     struct sk_buff *skb2)
165 {
166 	struct flow_keys temp;
167 
168 	if (skb1->protocol != skb2->protocol)
169 		return false;
170 
171 	if (!choke_skb_cb(skb1)->keys_valid) {
172 		choke_skb_cb(skb1)->keys_valid = 1;
173 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
174 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
175 	}
176 
177 	if (!choke_skb_cb(skb2)->keys_valid) {
178 		choke_skb_cb(skb2)->keys_valid = 1;
179 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
180 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
181 	}
182 
183 	return !memcmp(&choke_skb_cb(skb1)->keys,
184 		       &choke_skb_cb(skb2)->keys,
185 		       sizeof(choke_skb_cb(skb1)->keys));
186 }
187 
188 /*
189  * Classify flow using either:
190  *  1. pre-existing classification result in skb
191  *  2. fast internal classification
192  *  3. use TC filter based classification
193  */
194 static bool choke_classify(struct sk_buff *skb,
195 			   struct Qdisc *sch, int *qerr)
196 
197 {
198 	struct choke_sched_data *q = qdisc_priv(sch);
199 	struct tcf_result res;
200 	struct tcf_proto *fl;
201 	int result;
202 
203 	fl = rcu_dereference_bh(q->filter_list);
204 	result = tc_classify(skb, fl, &res, false);
205 	if (result >= 0) {
206 #ifdef CONFIG_NET_CLS_ACT
207 		switch (result) {
208 		case TC_ACT_STOLEN:
209 		case TC_ACT_QUEUED:
210 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
211 		case TC_ACT_SHOT:
212 			return false;
213 		}
214 #endif
215 		choke_set_classid(skb, TC_H_MIN(res.classid));
216 		return true;
217 	}
218 
219 	return false;
220 }
221 
222 /*
223  * Select a packet at random from queue
224  * HACK: since queue can have holes from previous deletion; retry several
225  *   times to find a random skb but then just give up and return the head
226  * Will return NULL if queue is empty (q->head == q->tail)
227  */
228 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
229 					 unsigned int *pidx)
230 {
231 	struct sk_buff *skb;
232 	int retrys = 3;
233 
234 	do {
235 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
236 		skb = q->tab[*pidx];
237 		if (skb)
238 			return skb;
239 	} while (--retrys > 0);
240 
241 	return q->tab[*pidx = q->head];
242 }
243 
244 /*
245  * Compare new packet with random packet in queue
246  * returns true if matched and sets *pidx
247  */
248 static bool choke_match_random(const struct choke_sched_data *q,
249 			       struct sk_buff *nskb,
250 			       unsigned int *pidx)
251 {
252 	struct sk_buff *oskb;
253 
254 	if (q->head == q->tail)
255 		return false;
256 
257 	oskb = choke_peek_random(q, pidx);
258 	if (rcu_access_pointer(q->filter_list))
259 		return choke_get_classid(nskb) == choke_get_classid(oskb);
260 
261 	return choke_match_flow(oskb, nskb);
262 }
263 
264 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
265 {
266 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
267 	struct choke_sched_data *q = qdisc_priv(sch);
268 	const struct red_parms *p = &q->parms;
269 
270 	if (rcu_access_pointer(q->filter_list)) {
271 		/* If using external classifiers, get result and record it. */
272 		if (!choke_classify(skb, sch, &ret))
273 			goto other_drop;	/* Packet was eaten by filter */
274 	}
275 
276 	choke_skb_cb(skb)->keys_valid = 0;
277 	/* Compute average queue usage (see RED) */
278 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
279 	if (red_is_idling(&q->vars))
280 		red_end_of_idle_period(&q->vars);
281 
282 	/* Is queue small? */
283 	if (q->vars.qavg <= p->qth_min)
284 		q->vars.qcount = -1;
285 	else {
286 		unsigned int idx;
287 
288 		/* Draw a packet at random from queue and compare flow */
289 		if (choke_match_random(q, skb, &idx)) {
290 			q->stats.matched++;
291 			choke_drop_by_idx(sch, idx);
292 			goto congestion_drop;
293 		}
294 
295 		/* Queue is large, always mark/drop */
296 		if (q->vars.qavg > p->qth_max) {
297 			q->vars.qcount = -1;
298 
299 			qdisc_qstats_overlimit(sch);
300 			if (use_harddrop(q) || !use_ecn(q) ||
301 			    !INET_ECN_set_ce(skb)) {
302 				q->stats.forced_drop++;
303 				goto congestion_drop;
304 			}
305 
306 			q->stats.forced_mark++;
307 		} else if (++q->vars.qcount) {
308 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
309 				q->vars.qcount = 0;
310 				q->vars.qR = red_random(p);
311 
312 				qdisc_qstats_overlimit(sch);
313 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
314 					q->stats.prob_drop++;
315 					goto congestion_drop;
316 				}
317 
318 				q->stats.prob_mark++;
319 			}
320 		} else
321 			q->vars.qR = red_random(p);
322 	}
323 
324 	/* Admit new packet */
325 	if (sch->q.qlen < q->limit) {
326 		q->tab[q->tail] = skb;
327 		q->tail = (q->tail + 1) & q->tab_mask;
328 		++sch->q.qlen;
329 		qdisc_qstats_backlog_inc(sch, skb);
330 		return NET_XMIT_SUCCESS;
331 	}
332 
333 	q->stats.pdrop++;
334 	return qdisc_drop(skb, sch);
335 
336 congestion_drop:
337 	qdisc_drop(skb, sch);
338 	return NET_XMIT_CN;
339 
340 other_drop:
341 	if (ret & __NET_XMIT_BYPASS)
342 		qdisc_qstats_drop(sch);
343 	kfree_skb(skb);
344 	return ret;
345 }
346 
347 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
348 {
349 	struct choke_sched_data *q = qdisc_priv(sch);
350 	struct sk_buff *skb;
351 
352 	if (q->head == q->tail) {
353 		if (!red_is_idling(&q->vars))
354 			red_start_of_idle_period(&q->vars);
355 		return NULL;
356 	}
357 
358 	skb = q->tab[q->head];
359 	q->tab[q->head] = NULL;
360 	choke_zap_head_holes(q);
361 	--sch->q.qlen;
362 	qdisc_qstats_backlog_dec(sch, skb);
363 	qdisc_bstats_update(sch, skb);
364 
365 	return skb;
366 }
367 
368 static unsigned int choke_drop(struct Qdisc *sch)
369 {
370 	struct choke_sched_data *q = qdisc_priv(sch);
371 	unsigned int len;
372 
373 	len = qdisc_queue_drop(sch);
374 	if (len > 0)
375 		q->stats.other++;
376 	else {
377 		if (!red_is_idling(&q->vars))
378 			red_start_of_idle_period(&q->vars);
379 	}
380 
381 	return len;
382 }
383 
384 static void choke_reset(struct Qdisc *sch)
385 {
386 	struct choke_sched_data *q = qdisc_priv(sch);
387 
388 	while (q->head != q->tail) {
389 		struct sk_buff *skb = q->tab[q->head];
390 
391 		q->head = (q->head + 1) & q->tab_mask;
392 		if (!skb)
393 			continue;
394 		qdisc_qstats_backlog_dec(sch, skb);
395 		--sch->q.qlen;
396 		qdisc_drop(skb, sch);
397 	}
398 
399 	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
400 	q->head = q->tail = 0;
401 	red_restart(&q->vars);
402 }
403 
404 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
405 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
406 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
407 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
408 };
409 
410 
411 static void choke_free(void *addr)
412 {
413 	kvfree(addr);
414 }
415 
416 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
417 {
418 	struct choke_sched_data *q = qdisc_priv(sch);
419 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
420 	const struct tc_red_qopt *ctl;
421 	int err;
422 	struct sk_buff **old = NULL;
423 	unsigned int mask;
424 	u32 max_P;
425 
426 	if (opt == NULL)
427 		return -EINVAL;
428 
429 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
430 	if (err < 0)
431 		return err;
432 
433 	if (tb[TCA_CHOKE_PARMS] == NULL ||
434 	    tb[TCA_CHOKE_STAB] == NULL)
435 		return -EINVAL;
436 
437 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
438 
439 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
440 
441 	if (ctl->limit > CHOKE_MAX_QUEUE)
442 		return -EINVAL;
443 
444 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
445 	if (mask != q->tab_mask) {
446 		struct sk_buff **ntab;
447 
448 		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
449 			       GFP_KERNEL | __GFP_NOWARN);
450 		if (!ntab)
451 			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
452 		if (!ntab)
453 			return -ENOMEM;
454 
455 		sch_tree_lock(sch);
456 		old = q->tab;
457 		if (old) {
458 			unsigned int oqlen = sch->q.qlen, tail = 0;
459 			unsigned dropped = 0;
460 
461 			while (q->head != q->tail) {
462 				struct sk_buff *skb = q->tab[q->head];
463 
464 				q->head = (q->head + 1) & q->tab_mask;
465 				if (!skb)
466 					continue;
467 				if (tail < mask) {
468 					ntab[tail++] = skb;
469 					continue;
470 				}
471 				dropped += qdisc_pkt_len(skb);
472 				qdisc_qstats_backlog_dec(sch, skb);
473 				--sch->q.qlen;
474 				qdisc_drop(skb, sch);
475 			}
476 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
477 			q->head = 0;
478 			q->tail = tail;
479 		}
480 
481 		q->tab_mask = mask;
482 		q->tab = ntab;
483 	} else
484 		sch_tree_lock(sch);
485 
486 	q->flags = ctl->flags;
487 	q->limit = ctl->limit;
488 
489 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
490 		      ctl->Plog, ctl->Scell_log,
491 		      nla_data(tb[TCA_CHOKE_STAB]),
492 		      max_P);
493 	red_set_vars(&q->vars);
494 
495 	if (q->head == q->tail)
496 		red_end_of_idle_period(&q->vars);
497 
498 	sch_tree_unlock(sch);
499 	choke_free(old);
500 	return 0;
501 }
502 
503 static int choke_init(struct Qdisc *sch, struct nlattr *opt)
504 {
505 	return choke_change(sch, opt);
506 }
507 
508 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
509 {
510 	struct choke_sched_data *q = qdisc_priv(sch);
511 	struct nlattr *opts = NULL;
512 	struct tc_red_qopt opt = {
513 		.limit		= q->limit,
514 		.flags		= q->flags,
515 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
516 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
517 		.Wlog		= q->parms.Wlog,
518 		.Plog		= q->parms.Plog,
519 		.Scell_log	= q->parms.Scell_log,
520 	};
521 
522 	opts = nla_nest_start(skb, TCA_OPTIONS);
523 	if (opts == NULL)
524 		goto nla_put_failure;
525 
526 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
527 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
528 		goto nla_put_failure;
529 	return nla_nest_end(skb, opts);
530 
531 nla_put_failure:
532 	nla_nest_cancel(skb, opts);
533 	return -EMSGSIZE;
534 }
535 
536 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
537 {
538 	struct choke_sched_data *q = qdisc_priv(sch);
539 	struct tc_choke_xstats st = {
540 		.early	= q->stats.prob_drop + q->stats.forced_drop,
541 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
542 		.pdrop	= q->stats.pdrop,
543 		.other	= q->stats.other,
544 		.matched = q->stats.matched,
545 	};
546 
547 	return gnet_stats_copy_app(d, &st, sizeof(st));
548 }
549 
550 static void choke_destroy(struct Qdisc *sch)
551 {
552 	struct choke_sched_data *q = qdisc_priv(sch);
553 
554 	tcf_destroy_chain(&q->filter_list);
555 	choke_free(q->tab);
556 }
557 
558 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
559 {
560 	struct choke_sched_data *q = qdisc_priv(sch);
561 
562 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
563 }
564 
565 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
566 	.id		=	"choke",
567 	.priv_size	=	sizeof(struct choke_sched_data),
568 
569 	.enqueue	=	choke_enqueue,
570 	.dequeue	=	choke_dequeue,
571 	.peek		=	choke_peek_head,
572 	.drop		=	choke_drop,
573 	.init		=	choke_init,
574 	.destroy	=	choke_destroy,
575 	.reset		=	choke_reset,
576 	.change		=	choke_change,
577 	.dump		=	choke_dump,
578 	.dump_stats	=	choke_dump_stats,
579 	.owner		=	THIS_MODULE,
580 };
581 
582 static int __init choke_module_init(void)
583 {
584 	return register_qdisc(&choke_qdisc_ops);
585 }
586 
587 static void __exit choke_module_exit(void)
588 {
589 	unregister_qdisc(&choke_qdisc_ops);
590 }
591 
592 module_init(choke_module_init)
593 module_exit(choke_module_exit)
594 
595 MODULE_LICENSE("GPL");
596