xref: /linux/net/sched/sch_choke.c (revision b85d45947951d23cb22d90caecf4c1eb81342c96)
/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */
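
/* A sketch of the enqueue decision implemented by choke_enqueue() below
 * (simplified; ECN marking and the hard-drop flag are left out):
 *
 *	qavg = EWMA of the queue length (see <net/red.h>)
 *	if (qavg <= qth_min)
 *		enqueue
 *	else
 *		pick a random packet R already in the queue
 *		if (flow(new) == flow(R))
 *			drop both		<- the CHOKe step
 *		else if (qavg > qth_max)
 *			drop new
 *		else
 *			drop new with RED probability, else enqueue
 */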

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto __rcu *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};
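
/* The queue is a circular buffer of skb pointers: head indexes the oldest
 * packet, tail is one slot past the newest, and random drops leave NULL
 * "holes" that are skipped lazily by the helpers below.
 */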

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
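
/* A minimal worked case: with tab_mask == 7 (8 slots), head == 6 and
 * tail == 2, (2 - 6) & 7 == 4, i.e. slots 6, 7, 0 and 1 are accounted
 * for, holes included.
 */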

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
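
/* Example: tab = [A, NULL, C, D], head == 0, tail == 4, where the hole at
 * index 1 was left by an earlier choke_drop_by_idx(1).  Dequeuing A NULLs
 * slot 0 and choke_zap_head_holes() steps head 0 -> 1 (hole) -> 2, so the
 * next dequeue returns C.
 */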

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_drop(skb, sch);
	qdisc_tree_decrease_qlen(sch, 1);
	--sch->q.qlen;
}

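/* Per-packet state kept in skb->cb while the packet is queued: the minor
 * classid assigned by TC filters and a cached flow-keys digest, so each
 * skb is dissected at most once by choke_match_flow().
 */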
struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination addresses and ports match;
 *          false for special cases
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	fl = rcu_dereference_bh(q->filter_list);
	result = tc_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
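			/* fall through: stolen/queued packets are not
			 * enqueued here either */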
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from queue
 * HACK: since queue can have holes from previous deletions, retry several
 *   times to find a random skb but then just give up and return the head
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}
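
/* The fallback is safe: the zap helpers keep q->tab[q->head] non-NULL
 * whenever the queue is non-empty, so the head slot is never a hole.
 */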

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (rcu_access_pointer(q->filter_list))
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	if (rcu_access_pointer(q->filter_list)) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
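			/* Between the thresholds: mark/drop with RED
			 * probability.  (When qcount was just reset to -1,
			 * the increment yields 0 and the else branch below
			 * only re-draws the random threshold qR.)
			 */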
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch);

congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		qdisc_qstats_drop(sch);
	kfree_skb(skb);
	return ret;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
	}

	return len;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		qdisc_qstats_backlog_dec(sch, skb);
		--sch->q.qlen;
		qdisc_drop(skb, sch);
	}

	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

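	/* Size the table to the next power of two above the limit so that
	 * ring indices wrap with a simple mask, e.g. limit == 1000 gives
	 * roundup_pow_of_two(1001) == 1024 and tab_mask == 1023.
	 */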
	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;

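			/* Compact surviving packets from the old ring into
			 * the new one, skipping holes; packets that no
			 * longer fit are dropped and the qlen change is
			 * propagated to the parent qdiscs.
			 */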
			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
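	/* parms hold qth_min/qth_max left-shifted by Wlog (the fixed-point
	 * scale of the queue average); shift back for the dump.
	 */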
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
				u32 classid)
{
	return 0;
}

static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
					       unsigned long cl)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	if (!arg->stop) {
		if (arg->fn(sch, 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops choke_class_ops = {
	.leaf		=	choke_leaf,
	.get		=	choke_get,
	.put		=	choke_put,
	.tcf_chain	=	choke_find_tcf,
	.bind_tcf	=	choke_bind,
	.unbind_tcf	=	choke_put,
	.dump		=	choke_dump_class,
	.walk		=	choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");