xref: /linux/net/sched/sch_choke.c (revision e58e871becec2d3b04ed91c0c16fe8deac9c9dfa)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/vmalloc.h>
18 #include <net/pkt_sched.h>
19 #include <net/pkt_cls.h>
20 #include <net/inet_ecn.h>
21 #include <net/red.h>
22 #include <net/flow_dissector.h>
23 
24 /*
25    CHOKe stateless AQM for fair bandwidth allocation
26    =================================================
27 
28    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
29    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
30    maintains no flow state. The difference from RED is an additional step
31    during the enqueuing process. If average queue size is over the
32    low threshold (qmin), a packet is chosen at random from the queue.
33    If both the new and chosen packet are from the same flow, both
34    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
35    needs to access packets in queue randomly. It has a minimal class
36    interface to allow overriding the builtin flow classifier with
37    filters.
38 
39    Source:
40    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
41    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
42    IEEE INFOCOM, 2000.
43 
44    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
45    Characteristics", IEEE/ACM Transactions on Networking, 2004
46 
47  */
48 
49 /* Upper bound on size of sk_buff table (packets) */
50 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
51 
52 struct choke_sched_data {
53 /* Parameters */
54 	u32		 limit;
55 	unsigned char	 flags;
56 
57 	struct red_parms parms;
58 
59 /* Variables */
60 	struct red_vars  vars;
61 	struct {
62 		u32	prob_drop;	/* Early probability drops */
63 		u32	prob_mark;	/* Early probability marks */
64 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
65 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
66 		u32	pdrop;          /* Drops due to queue limits */
67 		u32	other;          /* Drops due to drop() calls */
68 		u32	matched;	/* Drops to flow match */
69 	} stats;
70 
71 	unsigned int	 head;
72 	unsigned int	 tail;
73 
74 	unsigned int	 tab_mask; /* size - 1 */
75 
76 	struct sk_buff **tab;
77 };
78 
79 /* number of elements in queue including holes */
80 static unsigned int choke_len(const struct choke_sched_data *q)
81 {
82 	return (q->tail - q->head) & q->tab_mask;
83 }
84 
85 /* Is ECN parameter configured */
86 static int use_ecn(const struct choke_sched_data *q)
87 {
88 	return q->flags & TC_RED_ECN;
89 }
90 
91 /* Should packets over max just be dropped (versus marked) */
92 static int use_harddrop(const struct choke_sched_data *q)
93 {
94 	return q->flags & TC_RED_HARDDROP;
95 }
96 
97 /* Move head pointer forward to skip over holes */
98 static void choke_zap_head_holes(struct choke_sched_data *q)
99 {
100 	do {
101 		q->head = (q->head + 1) & q->tab_mask;
102 		if (q->head == q->tail)
103 			break;
104 	} while (q->tab[q->head] == NULL);
105 }
106 
107 /* Move tail pointer backwards to reuse holes */
108 static void choke_zap_tail_holes(struct choke_sched_data *q)
109 {
110 	do {
111 		q->tail = (q->tail - 1) & q->tab_mask;
112 		if (q->head == q->tail)
113 			break;
114 	} while (q->tab[q->tail] == NULL);
115 }
116 
117 /* Drop packet from queue array by creating a "hole" */
118 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
119 			      struct sk_buff **to_free)
120 {
121 	struct choke_sched_data *q = qdisc_priv(sch);
122 	struct sk_buff *skb = q->tab[idx];
123 
124 	q->tab[idx] = NULL;
125 
126 	if (idx == q->head)
127 		choke_zap_head_holes(q);
128 	if (idx == q->tail)
129 		choke_zap_tail_holes(q);
130 
131 	qdisc_qstats_backlog_dec(sch, skb);
132 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
133 	qdisc_drop(skb, sch, to_free);
134 	--sch->q.qlen;
135 }
136 
137 struct choke_skb_cb {
138 	u16			classid;
139 	u8			keys_valid;
140 	struct			flow_keys_digest keys;
141 };
142 
143 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
144 {
145 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
146 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
147 }
148 
149 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
150 {
151 	choke_skb_cb(skb)->classid = classid;
152 }
153 
154 /*
155  * Compare flow of two packets
156  *  Returns true only if source and destination address and port match.
157  *          false for special cases
158  */
159 static bool choke_match_flow(struct sk_buff *skb1,
160 			     struct sk_buff *skb2)
161 {
162 	struct flow_keys temp;
163 
164 	if (skb1->protocol != skb2->protocol)
165 		return false;
166 
167 	if (!choke_skb_cb(skb1)->keys_valid) {
168 		choke_skb_cb(skb1)->keys_valid = 1;
169 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
170 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
171 	}
172 
173 	if (!choke_skb_cb(skb2)->keys_valid) {
174 		choke_skb_cb(skb2)->keys_valid = 1;
175 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
176 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
177 	}
178 
179 	return !memcmp(&choke_skb_cb(skb1)->keys,
180 		       &choke_skb_cb(skb2)->keys,
181 		       sizeof(choke_skb_cb(skb1)->keys));
182 }
183 
184 /*
185  * Select a packet at random from queue
186  * HACK: since queue can have holes from previous deletion; retry several
187  *   times to find a random skb but then just give up and return the head
188  * Will return NULL if queue is empty (q->head == q->tail)
189  */
190 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
191 					 unsigned int *pidx)
192 {
193 	struct sk_buff *skb;
194 	int retrys = 3;
195 
196 	do {
197 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
198 		skb = q->tab[*pidx];
199 		if (skb)
200 			return skb;
201 	} while (--retrys > 0);
202 
203 	return q->tab[*pidx = q->head];
204 }
205 
206 /*
207  * Compare new packet with random packet in queue
208  * returns true if matched and sets *pidx
209  */
210 static bool choke_match_random(const struct choke_sched_data *q,
211 			       struct sk_buff *nskb,
212 			       unsigned int *pidx)
213 {
214 	struct sk_buff *oskb;
215 
216 	if (q->head == q->tail)
217 		return false;
218 
219 	oskb = choke_peek_random(q, pidx);
220 	return choke_match_flow(oskb, nskb);
221 }
222 
223 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
224 			 struct sk_buff **to_free)
225 {
226 	struct choke_sched_data *q = qdisc_priv(sch);
227 	const struct red_parms *p = &q->parms;
228 
229 	choke_skb_cb(skb)->keys_valid = 0;
230 	/* Compute average queue usage (see RED) */
231 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
232 	if (red_is_idling(&q->vars))
233 		red_end_of_idle_period(&q->vars);
234 
235 	/* Is queue small? */
236 	if (q->vars.qavg <= p->qth_min)
237 		q->vars.qcount = -1;
238 	else {
239 		unsigned int idx;
240 
241 		/* Draw a packet at random from queue and compare flow */
242 		if (choke_match_random(q, skb, &idx)) {
243 			q->stats.matched++;
244 			choke_drop_by_idx(sch, idx, to_free);
245 			goto congestion_drop;
246 		}
247 
248 		/* Queue is large, always mark/drop */
249 		if (q->vars.qavg > p->qth_max) {
250 			q->vars.qcount = -1;
251 
252 			qdisc_qstats_overlimit(sch);
253 			if (use_harddrop(q) || !use_ecn(q) ||
254 			    !INET_ECN_set_ce(skb)) {
255 				q->stats.forced_drop++;
256 				goto congestion_drop;
257 			}
258 
259 			q->stats.forced_mark++;
260 		} else if (++q->vars.qcount) {
261 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
262 				q->vars.qcount = 0;
263 				q->vars.qR = red_random(p);
264 
265 				qdisc_qstats_overlimit(sch);
266 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
267 					q->stats.prob_drop++;
268 					goto congestion_drop;
269 				}
270 
271 				q->stats.prob_mark++;
272 			}
273 		} else
274 			q->vars.qR = red_random(p);
275 	}
276 
277 	/* Admit new packet */
278 	if (sch->q.qlen < q->limit) {
279 		q->tab[q->tail] = skb;
280 		q->tail = (q->tail + 1) & q->tab_mask;
281 		++sch->q.qlen;
282 		qdisc_qstats_backlog_inc(sch, skb);
283 		return NET_XMIT_SUCCESS;
284 	}
285 
286 	q->stats.pdrop++;
287 	return qdisc_drop(skb, sch, to_free);
288 
289 congestion_drop:
290 	qdisc_drop(skb, sch, to_free);
291 	return NET_XMIT_CN;
292 }
293 
294 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
295 {
296 	struct choke_sched_data *q = qdisc_priv(sch);
297 	struct sk_buff *skb;
298 
299 	if (q->head == q->tail) {
300 		if (!red_is_idling(&q->vars))
301 			red_start_of_idle_period(&q->vars);
302 		return NULL;
303 	}
304 
305 	skb = q->tab[q->head];
306 	q->tab[q->head] = NULL;
307 	choke_zap_head_holes(q);
308 	--sch->q.qlen;
309 	qdisc_qstats_backlog_dec(sch, skb);
310 	qdisc_bstats_update(sch, skb);
311 
312 	return skb;
313 }
314 
315 static void choke_reset(struct Qdisc *sch)
316 {
317 	struct choke_sched_data *q = qdisc_priv(sch);
318 
319 	while (q->head != q->tail) {
320 		struct sk_buff *skb = q->tab[q->head];
321 
322 		q->head = (q->head + 1) & q->tab_mask;
323 		if (!skb)
324 			continue;
325 		rtnl_qdisc_drop(skb, sch);
326 	}
327 
328 	sch->q.qlen = 0;
329 	sch->qstats.backlog = 0;
330 	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
331 	q->head = q->tail = 0;
332 	red_restart(&q->vars);
333 }
334 
335 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
336 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
337 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
338 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
339 };
340 
341 
342 static void choke_free(void *addr)
343 {
344 	kvfree(addr);
345 }
346 
347 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
348 {
349 	struct choke_sched_data *q = qdisc_priv(sch);
350 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
351 	const struct tc_red_qopt *ctl;
352 	int err;
353 	struct sk_buff **old = NULL;
354 	unsigned int mask;
355 	u32 max_P;
356 
357 	if (opt == NULL)
358 		return -EINVAL;
359 
360 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL);
361 	if (err < 0)
362 		return err;
363 
364 	if (tb[TCA_CHOKE_PARMS] == NULL ||
365 	    tb[TCA_CHOKE_STAB] == NULL)
366 		return -EINVAL;
367 
368 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
369 
370 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
371 
372 	if (ctl->limit > CHOKE_MAX_QUEUE)
373 		return -EINVAL;
374 
375 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
376 	if (mask != q->tab_mask) {
377 		struct sk_buff **ntab;
378 
379 		ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
380 		if (!ntab)
381 			return -ENOMEM;
382 
383 		sch_tree_lock(sch);
384 		old = q->tab;
385 		if (old) {
386 			unsigned int oqlen = sch->q.qlen, tail = 0;
387 			unsigned dropped = 0;
388 
389 			while (q->head != q->tail) {
390 				struct sk_buff *skb = q->tab[q->head];
391 
392 				q->head = (q->head + 1) & q->tab_mask;
393 				if (!skb)
394 					continue;
395 				if (tail < mask) {
396 					ntab[tail++] = skb;
397 					continue;
398 				}
399 				dropped += qdisc_pkt_len(skb);
400 				qdisc_qstats_backlog_dec(sch, skb);
401 				--sch->q.qlen;
402 				rtnl_qdisc_drop(skb, sch);
403 			}
404 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
405 			q->head = 0;
406 			q->tail = tail;
407 		}
408 
409 		q->tab_mask = mask;
410 		q->tab = ntab;
411 	} else
412 		sch_tree_lock(sch);
413 
414 	q->flags = ctl->flags;
415 	q->limit = ctl->limit;
416 
417 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
418 		      ctl->Plog, ctl->Scell_log,
419 		      nla_data(tb[TCA_CHOKE_STAB]),
420 		      max_P);
421 	red_set_vars(&q->vars);
422 
423 	if (q->head == q->tail)
424 		red_end_of_idle_period(&q->vars);
425 
426 	sch_tree_unlock(sch);
427 	choke_free(old);
428 	return 0;
429 }
430 
431 static int choke_init(struct Qdisc *sch, struct nlattr *opt)
432 {
433 	return choke_change(sch, opt);
434 }
435 
436 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
437 {
438 	struct choke_sched_data *q = qdisc_priv(sch);
439 	struct nlattr *opts = NULL;
440 	struct tc_red_qopt opt = {
441 		.limit		= q->limit,
442 		.flags		= q->flags,
443 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
444 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
445 		.Wlog		= q->parms.Wlog,
446 		.Plog		= q->parms.Plog,
447 		.Scell_log	= q->parms.Scell_log,
448 	};
449 
450 	opts = nla_nest_start(skb, TCA_OPTIONS);
451 	if (opts == NULL)
452 		goto nla_put_failure;
453 
454 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
455 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
456 		goto nla_put_failure;
457 	return nla_nest_end(skb, opts);
458 
459 nla_put_failure:
460 	nla_nest_cancel(skb, opts);
461 	return -EMSGSIZE;
462 }
463 
464 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
465 {
466 	struct choke_sched_data *q = qdisc_priv(sch);
467 	struct tc_choke_xstats st = {
468 		.early	= q->stats.prob_drop + q->stats.forced_drop,
469 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
470 		.pdrop	= q->stats.pdrop,
471 		.other	= q->stats.other,
472 		.matched = q->stats.matched,
473 	};
474 
475 	return gnet_stats_copy_app(d, &st, sizeof(st));
476 }
477 
478 static void choke_destroy(struct Qdisc *sch)
479 {
480 	struct choke_sched_data *q = qdisc_priv(sch);
481 
482 	choke_free(q->tab);
483 }
484 
485 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
486 {
487 	struct choke_sched_data *q = qdisc_priv(sch);
488 
489 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
490 }
491 
492 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
493 	.id		=	"choke",
494 	.priv_size	=	sizeof(struct choke_sched_data),
495 
496 	.enqueue	=	choke_enqueue,
497 	.dequeue	=	choke_dequeue,
498 	.peek		=	choke_peek_head,
499 	.init		=	choke_init,
500 	.destroy	=	choke_destroy,
501 	.reset		=	choke_reset,
502 	.change		=	choke_change,
503 	.dump		=	choke_dump,
504 	.dump_stats	=	choke_dump_stats,
505 	.owner		=	THIS_MODULE,
506 };
507 
508 static int __init choke_module_init(void)
509 {
510 	return register_qdisc(&choke_qdisc_ops);
511 }
512 
513 static void __exit choke_module_exit(void)
514 {
515 	unregister_qdisc(&choke_qdisc_ops);
516 }
517 
518 module_init(choke_module_init)
519 module_exit(choke_module_exit)
520 
521 MODULE_LICENSE("GPL");
522