/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is an additional
   step during enqueue: if the average queue size is over the low threshold
   (qmin), a packet is chosen at random from the queue. If both the new and
   the chosen packet are from the same flow, both are dropped. Unlike RED,
   CHOKe is not really a "classful" qdisc because it needs to access packets
   in the queue at random. It has a minimal class interface to allow
   overriding the builtin flow classifier with filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */
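
/*
 * Illustrative userspace setup (a sketch only; the parameter names follow
 * the RED-style options iproute2's tc accepts for "choke" -- consult
 * tc-choke(8) for the authoritative syntax, units and defaults):
 *
 *   tc qdisc add dev eth0 root choke limit 1000 min 100 max 300 \
 *	avpkt 1000 burst 150 bandwidth 10mbit probability 0.02 ecn
 */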

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)
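/* The table is sized to roundup_pow_of_two(limit + 1) in choke_change(),
 * so this cap keeps the ring at no more than 128K slots.
 */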

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};
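
/*
 * The packet table is used as a power-of-two ring buffer indexed by head
 * and tail.  Dropping a packet from the middle leaves a NULL "hole"; the
 * helpers below skip over holes when the head or tail reaches one.
 */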

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_drop(skb, sch);
	qdisc_tree_decrease_qlen(sch, 1);
	--sch->q.qlen;
}

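/* Per-skb scratch space kept in skb->cb: the classid chosen by the filters
 * and a lazily dissected copy of the packet's flow keys, so the flow
 * dissector runs at most once per packet.
 */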
struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct flow_keys	keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare the flows of two packets.
 *  Returns true only if source and destination address and port match;
 *          false for special cases (e.g. differing protocols).
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(struct flow_keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
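			/* fall through */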
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then just give up and return the
 *   head.
 * Will return NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retrys = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retrys > 0);

	return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with a random packet in the queue.
 * Returns true if matched and sets *pidx.
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (q->filter_list)
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

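/*
 * Enqueue: classify the packet if filters are attached, update the RED
 * average queue size, and if the average exceeds qth_min compare the new
 * packet against one drawn at random from the queue -- a flow match drops
 * both packets.  Otherwise apply the usual RED probabilistic/forced
 * mark-or-drop decisions before adding the packet at the tail of the ring.
 */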
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

	if (q->filter_list) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			sch->qstats.overlimits++;
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				sch->qstats.overlimits++;
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		sch->qstats.backlog += qdisc_pkt_len(skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch);

congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		sch->qstats.drops++;
	kfree_skb(skb);
	return ret;
}

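/* Dequeue the packet at the head of the ring and advance the head past any
 * holes; when the queue is empty, start the RED idle period.
 */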
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
	}

	return len;
}

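/* ->reset(): restart the RED averaging state (qavg, count and idle timing). */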
static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}

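/*
 * (Re)configure the qdisc: validate the netlink attributes, resize the
 * packet ring if the limit changed (migrating queued packets and dropping
 * any that no longer fit), then load the RED parameters.
 */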
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

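	/* Size the ring to a power of two so indices can wrap with '& tab_mask'. */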
	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

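/* Dump the current RED/CHOKe parameters (tc_red_qopt plus max_P) to userspace. */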
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

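/* Export CHOKe-specific counters: early drops, ECN marks, queue-limit drops,
 * drops via ->drop(), and flow-match drops.
 */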
static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

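/*
 * Minimal class interface: CHOKe has no real classes; these stubs exist
 * only so that tc filters can be attached (see choke_find_tcf()).
 */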
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
				u32 classid)
{
	return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	if (!arg->stop) {
		if (arg->fn(sch, 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops choke_class_ops = {
	.leaf		=	choke_leaf,
	.get		=	choke_get,
	.put		=	choke_put,
	.tcf_chain	=	choke_find_tcf,
	.bind_tcf	=	choke_bind,
	.unbind_tcf	=	choke_put,
	.dump		=	choke_dump_class,
	.walk		=	choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");