/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no per-flow state. The difference from RED is an additional
   step during enqueue: if the average queue size exceeds the minimum
   threshold (qmin), a packet is chosen at random from the queue. If the
   new packet and the chosen packet belong to the same flow, both are
   dropped. Unlike RED, CHOKe cannot be a truly "classful" qdisc because it
   needs random access to the packets in its queue. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Sources:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */

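/*
 * A compiled-out sketch of the per-packet decision described above; the
 * helper, its argument names and the unscaled thresholds are hypothetical,
 * shown only to summarize the control flow of choke_enqueue() below.
 */
#if 0
enum choke_verdict { CHOKE_ADMIT, CHOKE_DROP_BOTH, CHOKE_DROP_NEW, CHOKE_RED };

static enum choke_verdict choke_decide(u32 qavg, u32 qmin, u32 qmax,
				       bool same_flow_as_victim)
{
	if (qavg <= qmin)
		return CHOKE_ADMIT;	/* small queue: plain FIFO */
	if (same_flow_as_victim)
		return CHOKE_DROP_BOTH;	/* penalize the heavy flow */
	if (qavg > qmax)
		return CHOKE_DROP_NEW;	/* over max: always mark or drop */
	return CHOKE_RED;		/* in between: RED probability */
}
#endif
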
/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;		/* Drops due to queue limits */
		u32	other;		/* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
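
/*
 * Worked example of the mask arithmetic above, assuming a 128-slot
 * table (tab_mask = 127): with head = 100 and tail = 3 (tail has
 * wrapped), (3 - 100) & 127 = 31, the number of slots in use
 * including holes.  This only works because the table size is
 * always a power of two.
 */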

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
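
/*
 * Hole handling, worked on a hypothetical 8-slot table (tab_mask = 7):
 *
 *   idx:  0    1    2     3     4     5    6    7
 *   tab:  -    -    skbA  NULL  skbB  -    -    -    (head = 2, tail = 5)
 *
 * An earlier drop left a hole at index 3.  When skbA is dequeued,
 * choke_zap_head_holes() advances head from 2 straight to 4, so the
 * next dequeue finds skbB.  This keeps tab[head] non-NULL whenever
 * the queue is non-empty, which choke_peek_random() below relies on
 * for its fallback.
 */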

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_drop(skb, sch);
	qdisc_tree_decrease_qlen(sch, 1);
	--sch->q.qlen;
}

/* The private part of skb->cb[] that a qdisc is allowed to use
 * is limited to QDISC_CB_PRIV_LEN bytes.
 * As a flow key might be too large to fit, we store only part of it.
 */
#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)

struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	u8			keys[QDISC_CB_PRIV_LEN - 3];
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}
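
/*
 * Layout check: 2 bytes (classid) + 1 (keys_valid) +
 * (QDISC_CB_PRIV_LEN - 3) key bytes exactly fill the private cb area,
 * and CHOKE_K_LEN caps how much of struct flow_keys is copied in.  A
 * compiled-out assertion sketch of that invariant:
 */
#if 0
static void choke_skb_cb_size_check(void)
{
	BUILD_BUG_ON(sizeof(struct choke_skb_cb) > QDISC_CB_PRIV_LEN);
}
#endif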

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare the flows of two packets.
 * Returns true only if the dissected keys (addresses and ports) match;
 * returns false when the packets cannot belong to the same flow (e.g.
 * different protocols).
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect(skb1, &temp);
		memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect(skb2, &temp);
		memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       CHOKE_K_LEN);
}
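
/*
 * Note that only the first CHOKE_K_LEN bytes of the dissected keys are
 * stored and compared, so when struct flow_keys does not fit in the cb
 * area two distinct flows could, rarely, present identical truncated
 * keys and be treated as one.  For CHOKe the cost of such a false match
 * is just one extra drop, an acceptable trade for keeping no flow state.
 */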

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then just give up and return
 *   the head.
 * Will return NULL if the queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}
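
/*
 * The bounded retry keeps the pick O(1).  If a fraction h of the range
 * between head and tail is holes, all three picks miss with probability
 * h^3 (e.g. h = 0.5 gives 12.5%); in that case the head slot is used,
 * which is safe because hole-zapping keeps tab[head] non-NULL whenever
 * the queue is non-empty.
 */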

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (q->filter_list)
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

	if (q->filter_list) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			sch->qstats.overlimits++;
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				sch->qstats.overlimits++;
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		sch->qstats.backlog += qdisc_pkt_len(skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch);

congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		sch->qstats.drops++;
	kfree_skb(skb);
	return ret;
}
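
/*
 * Worked example of the decision regions above, assuming hypothetical
 * unscaled thresholds qth_min = 5 and qth_max = 15:
 *
 *   qavg <= 5      : admit; no CHOKe or RED action (qcount = -1)
 *   5 < qavg <= 15 : try the random flow-match first (drops both on a
 *                    hit), otherwise mark/drop with a probability that
 *                    grows with qavg (prob_drop/prob_mark)
 *   qavg > 15      : random flow-match first, then always mark or
 *                    drop (forced_drop/forced_mark)
 *
 * ECN marking is used instead of dropping only when TC_RED_ECN is set
 * and, above qth_max, TC_RED_HARDDROP is not.
 */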

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
	}

	return len;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}
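
/*
 * Table sizing, worked for a hypothetical limit of 1000 packets:
 * roundup_pow_of_two(1001) = 1024, so tab_mask = 1023 and 1024 slots
 * are allocated.  Using limit + 1 guarantees at least one slot beyond
 * the packet limit, so a full queue never makes tail catch up with
 * head and head == tail can keep meaning "empty".
 */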

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}
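
/*
 * The shifts above undo the fixed-point scaling applied by
 * red_set_parms(), which stores the thresholds pre-multiplied by
 * 2^Wlog.  Worked example, assuming Wlog = 9 and a configured qth_min
 * of 5 packets: the stored value is 5 << 9 = 2560, and the dump
 * returns 2560 >> 9 = 5, the value the user originally set.
 */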

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
				u32 classid)
{
	return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	if (!arg->stop) {
		if (arg->fn(sch, 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops choke_class_ops = {
	.leaf		=	choke_leaf,
	.get		=	choke_get,
	.put		=	choke_put,
	.tcf_chain	=	choke_find_tcf,
	.bind_tcf	=	choke_bind,
	.unbind_tcf	=	choke_put,
	.dump		=	choke_dump_class,
	.walk		=	choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");