xref: /linux/net/sched/sch_fq_pie.c (revision 173b0b5b0e865348684c02bd9cb1d22b5d46e458)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Flow Queue PIE discipline
3  *
4  * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
5  * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
6  * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
7  * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
8  * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
9  * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
10  */
11 
12 #include <linux/jhash.h>
13 #include <linux/module.h>
14 #include <linux/sizes.h>
15 #include <linux/vmalloc.h>
16 #include <net/pkt_cls.h>
17 #include <net/pie.h>
18 
19 /* Flow Queue PIE
20  *
21  * Principles:
22  *   - Packets are classified on flows.
23  *   - This is a Stochastic model (as we use a hash, several flows might
24  *                                 be hashed to the same slot)
25  *   - Each flow has a PIE managed queue.
26  *   - Flows are linked onto two (Round Robin) lists,
27  *     so that new flows have priority on old ones.
28  *   - For a given flow, packets are not reordered.
29  *   - Drops during enqueue only.
30  *   - ECN capability is off by default.
31  *   - ECN threshold (if ECN is enabled) is at 10% by default.
32  *   - Uses timestamps to calculate queue delay by default.
33  */
34 
35 /**
36  * struct fq_pie_flow - contains data for each flow
37  * @vars:	pie vars associated with the flow
38  * @deficit:	number of remaining byte credits
39  * @backlog:	size of data in the flow
40  * @qlen:	number of packets in the flow
41  * @flowchain:	flowchain for the flow
42  * @head:	first packet in the flow
43  * @tail:	last packet in the flow
44  */
45 struct fq_pie_flow {
46 	struct pie_vars vars;
47 	s32 deficit;
48 	u32 backlog;
49 	u32 qlen;
50 	struct list_head flowchain;
51 	struct sk_buff *head;
52 	struct sk_buff *tail;
53 };
54 
55 struct fq_pie_sched_data {
56 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
57 	struct tcf_block *block;
58 	struct fq_pie_flow *flows;
59 	struct Qdisc *sch;
60 	struct list_head old_flows;
61 	struct list_head new_flows;
62 	struct pie_params p_params;
63 	u32 ecn_prob;
64 	u32 flows_cnt;
65 	u32 flows_cursor;
66 	u32 quantum;
67 	u32 memory_limit;
68 	u32 new_flow_count;
69 	u32 memory_usage;
70 	u32 overmemory;
71 	struct pie_stats stats;
72 	struct timer_list adapt_timer;
73 };
74 
75 static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
76 				struct sk_buff *skb)
77 {
78 	return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
79 }
80 
81 static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
82 				    int *qerr)
83 {
84 	struct fq_pie_sched_data *q = qdisc_priv(sch);
85 	struct tcf_proto *filter;
86 	struct tcf_result res;
87 	int result;
88 
89 	if (TC_H_MAJ(skb->priority) == sch->handle &&
90 	    TC_H_MIN(skb->priority) > 0 &&
91 	    TC_H_MIN(skb->priority) <= q->flows_cnt)
92 		return TC_H_MIN(skb->priority);
93 
94 	filter = rcu_dereference_bh(q->filter_list);
95 	if (!filter)
96 		return fq_pie_hash(q, skb) + 1;
97 
98 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
99 	result = tcf_classify(skb, NULL, filter, &res, false);
100 	if (result >= 0) {
101 #ifdef CONFIG_NET_CLS_ACT
102 		switch (result) {
103 		case TC_ACT_STOLEN:
104 		case TC_ACT_QUEUED:
105 		case TC_ACT_TRAP:
106 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
107 			fallthrough;
108 		case TC_ACT_SHOT:
109 			return 0;
110 		}
111 #endif
112 		if (TC_H_MIN(res.classid) <= q->flows_cnt)
113 			return TC_H_MIN(res.classid);
114 	}
115 	return 0;
116 }
117 
118 /* add skb to flow queue (tail add) */
119 static inline void flow_queue_add(struct fq_pie_flow *flow,
120 				  struct sk_buff *skb)
121 {
122 	if (!flow->head)
123 		flow->head = skb;
124 	else
125 		flow->tail->next = skb;
126 	flow->tail = skb;
127 	skb->next = NULL;
128 }
129 
130 static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
131 				struct sk_buff **to_free)
132 {
133 	struct fq_pie_sched_data *q = qdisc_priv(sch);
134 	struct fq_pie_flow *sel_flow;
135 	int ret;
136 	u8 memory_limited = false;
137 	u8 enqueue = false;
138 	u32 pkt_len;
139 	u32 idx;
140 
141 	/* Classifies packet into corresponding flow */
142 	idx = fq_pie_classify(skb, sch, &ret);
143 	if (idx == 0) {
144 		if (ret & __NET_XMIT_BYPASS)
145 			qdisc_qstats_drop(sch);
146 		__qdisc_drop(skb, to_free);
147 		return ret;
148 	}
149 	idx--;
150 
151 	sel_flow = &q->flows[idx];
152 	/* Checks whether adding a new packet would exceed memory limit */
153 	get_pie_cb(skb)->mem_usage = skb->truesize;
154 	memory_limited = q->memory_usage > q->memory_limit + skb->truesize;
155 
156 	/* Checks if the qdisc is full */
157 	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
158 		q->stats.overlimit++;
159 		goto out;
160 	} else if (unlikely(memory_limited)) {
161 		q->overmemory++;
162 	}
163 
164 	if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
165 			    sel_flow->backlog, skb->len)) {
166 		enqueue = true;
167 	} else if (q->p_params.ecn &&
168 		   sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
169 		   INET_ECN_set_ce(skb)) {
170 		/* If packet is ecn capable, mark it if drop probability
171 		 * is lower than the parameter ecn_prob, else drop it.
172 		 */
173 		q->stats.ecn_mark++;
174 		enqueue = true;
175 	}
176 	if (enqueue) {
177 		/* Set enqueue time only when dq_rate_estimator is disabled. */
178 		if (!q->p_params.dq_rate_estimator)
179 			pie_set_enqueue_time(skb);
180 
181 		pkt_len = qdisc_pkt_len(skb);
182 		q->stats.packets_in++;
183 		q->memory_usage += skb->truesize;
184 		sch->qstats.backlog += pkt_len;
185 		sch->q.qlen++;
186 		flow_queue_add(sel_flow, skb);
187 		if (list_empty(&sel_flow->flowchain)) {
188 			list_add_tail(&sel_flow->flowchain, &q->new_flows);
189 			q->new_flow_count++;
190 			sel_flow->deficit = q->quantum;
191 			sel_flow->qlen = 0;
192 			sel_flow->backlog = 0;
193 		}
194 		sel_flow->qlen++;
195 		sel_flow->backlog += pkt_len;
196 		return NET_XMIT_SUCCESS;
197 	}
198 out:
199 	q->stats.dropped++;
200 	sel_flow->vars.accu_prob = 0;
201 	__qdisc_drop(skb, to_free);
202 	qdisc_qstats_drop(sch);
203 	return NET_XMIT_CN;
204 }
205 
206 static const struct netlink_range_validation fq_pie_q_range = {
207 	.min = 1,
208 	.max = 1 << 20,
209 };
210 
211 static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
212 	[TCA_FQ_PIE_LIMIT]		= {.type = NLA_U32},
213 	[TCA_FQ_PIE_FLOWS]		= {.type = NLA_U32},
214 	[TCA_FQ_PIE_TARGET]		= {.type = NLA_U32},
215 	[TCA_FQ_PIE_TUPDATE]		= {.type = NLA_U32},
216 	[TCA_FQ_PIE_ALPHA]		= {.type = NLA_U32},
217 	[TCA_FQ_PIE_BETA]		= {.type = NLA_U32},
218 	[TCA_FQ_PIE_QUANTUM]		=
219 			NLA_POLICY_FULL_RANGE(NLA_U32, &fq_pie_q_range),
220 	[TCA_FQ_PIE_MEMORY_LIMIT]	= {.type = NLA_U32},
221 	[TCA_FQ_PIE_ECN_PROB]		= {.type = NLA_U32},
222 	[TCA_FQ_PIE_ECN]		= {.type = NLA_U32},
223 	[TCA_FQ_PIE_BYTEMODE]		= {.type = NLA_U32},
224 	[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]	= {.type = NLA_U32},
225 };
226 
227 static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
228 {
229 	struct sk_buff *skb = flow->head;
230 
231 	flow->head = skb->next;
232 	skb->next = NULL;
233 	return skb;
234 }
235 
236 static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
237 {
238 	struct fq_pie_sched_data *q = qdisc_priv(sch);
239 	struct sk_buff *skb = NULL;
240 	struct fq_pie_flow *flow;
241 	struct list_head *head;
242 	u32 pkt_len;
243 
244 begin:
245 	head = &q->new_flows;
246 	if (list_empty(head)) {
247 		head = &q->old_flows;
248 		if (list_empty(head))
249 			return NULL;
250 	}
251 
252 	flow = list_first_entry(head, struct fq_pie_flow, flowchain);
253 	/* Flow has exhausted all its credits */
254 	if (flow->deficit <= 0) {
255 		flow->deficit += q->quantum;
256 		list_move_tail(&flow->flowchain, &q->old_flows);
257 		goto begin;
258 	}
259 
260 	if (flow->head) {
261 		skb = dequeue_head(flow);
262 		pkt_len = qdisc_pkt_len(skb);
263 		sch->qstats.backlog -= pkt_len;
264 		sch->q.qlen--;
265 		qdisc_bstats_update(sch, skb);
266 	}
267 
268 	if (!skb) {
269 		/* force a pass through old_flows to prevent starvation */
270 		if (head == &q->new_flows && !list_empty(&q->old_flows))
271 			list_move_tail(&flow->flowchain, &q->old_flows);
272 		else
273 			list_del_init(&flow->flowchain);
274 		goto begin;
275 	}
276 
277 	flow->qlen--;
278 	flow->deficit -= pkt_len;
279 	flow->backlog -= pkt_len;
280 	q->memory_usage -= get_pie_cb(skb)->mem_usage;
281 	pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
282 	return skb;
283 }
284 
285 static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
286 			 struct netlink_ext_ack *extack)
287 {
288 	struct fq_pie_sched_data *q = qdisc_priv(sch);
289 	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
290 	unsigned int len_dropped = 0;
291 	unsigned int num_dropped = 0;
292 	int err;
293 
294 	err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
295 	if (err < 0)
296 		return err;
297 
298 	sch_tree_lock(sch);
299 	if (tb[TCA_FQ_PIE_LIMIT]) {
300 		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
301 
302 		q->p_params.limit = limit;
303 		sch->limit = limit;
304 	}
305 	if (tb[TCA_FQ_PIE_FLOWS]) {
306 		if (q->flows) {
307 			NL_SET_ERR_MSG_MOD(extack,
308 					   "Number of flows cannot be changed");
309 			goto flow_error;
310 		}
311 		q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
312 		if (!q->flows_cnt || q->flows_cnt > 65536) {
313 			NL_SET_ERR_MSG_MOD(extack,
314 					   "Number of flows must range in [1..65536]");
315 			goto flow_error;
316 		}
317 	}
318 
319 	/* convert from microseconds to pschedtime */
320 	if (tb[TCA_FQ_PIE_TARGET]) {
321 		/* target is in us */
322 		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
323 
324 		/* convert to pschedtime */
325 		q->p_params.target =
326 			PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
327 	}
328 
329 	/* tupdate is in jiffies */
330 	if (tb[TCA_FQ_PIE_TUPDATE])
331 		q->p_params.tupdate =
332 			usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
333 
334 	if (tb[TCA_FQ_PIE_ALPHA])
335 		q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
336 
337 	if (tb[TCA_FQ_PIE_BETA])
338 		q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
339 
340 	if (tb[TCA_FQ_PIE_QUANTUM])
341 		q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
342 
343 	if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
344 		q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);
345 
346 	if (tb[TCA_FQ_PIE_ECN_PROB])
347 		q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
348 
349 	if (tb[TCA_FQ_PIE_ECN])
350 		q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
351 
352 	if (tb[TCA_FQ_PIE_BYTEMODE])
353 		q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
354 
355 	if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
356 		q->p_params.dq_rate_estimator =
357 			nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);
358 
359 	/* Drop excess packets if new limit is lower */
360 	while (sch->q.qlen > sch->limit) {
361 		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
362 
363 		len_dropped += qdisc_pkt_len(skb);
364 		num_dropped += 1;
365 		rtnl_kfree_skbs(skb, skb);
366 	}
367 	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
368 
369 	sch_tree_unlock(sch);
370 	return 0;
371 
372 flow_error:
373 	sch_tree_unlock(sch);
374 	return -EINVAL;
375 }
376 
377 static void fq_pie_timer(struct timer_list *t)
378 {
379 	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
380 	unsigned long next, tupdate;
381 	struct Qdisc *sch = q->sch;
382 	spinlock_t *root_lock; /* to lock qdisc for probability calculations */
383 	int max_cnt, i;
384 
385 	rcu_read_lock();
386 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
387 	spin_lock(root_lock);
388 
389 	/* Limit this expensive loop to 2048 flows per round. */
390 	max_cnt = min_t(int, q->flows_cnt - q->flows_cursor, 2048);
391 	for (i = 0; i < max_cnt; i++) {
392 		pie_calculate_probability(&q->p_params,
393 					  &q->flows[q->flows_cursor].vars,
394 					  q->flows[q->flows_cursor].backlog);
395 		q->flows_cursor++;
396 	}
397 
398 	tupdate = q->p_params.tupdate;
399 	next = 0;
400 	if (q->flows_cursor >= q->flows_cnt) {
401 		q->flows_cursor = 0;
402 		next = tupdate;
403 	}
404 	if (tupdate)
405 		mod_timer(&q->adapt_timer, jiffies + next);
406 	spin_unlock(root_lock);
407 	rcu_read_unlock();
408 }
409 
410 static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
411 		       struct netlink_ext_ack *extack)
412 {
413 	struct fq_pie_sched_data *q = qdisc_priv(sch);
414 	int err;
415 	u32 idx;
416 
417 	pie_params_init(&q->p_params);
418 	sch->limit = 10 * 1024;
419 	q->p_params.limit = sch->limit;
420 	q->quantum = psched_mtu(qdisc_dev(sch));
421 	q->sch = sch;
422 	q->ecn_prob = 10;
423 	q->flows_cnt = 1024;
424 	q->memory_limit = SZ_32M;
425 
426 	INIT_LIST_HEAD(&q->new_flows);
427 	INIT_LIST_HEAD(&q->old_flows);
428 	timer_setup(&q->adapt_timer, fq_pie_timer, 0);
429 
430 	if (opt) {
431 		err = fq_pie_change(sch, opt, extack);
432 
433 		if (err)
434 			return err;
435 	}
436 
437 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
438 	if (err)
439 		goto init_failure;
440 
441 	q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
442 			    GFP_KERNEL);
443 	if (!q->flows) {
444 		err = -ENOMEM;
445 		goto init_failure;
446 	}
447 	for (idx = 0; idx < q->flows_cnt; idx++) {
448 		struct fq_pie_flow *flow = q->flows + idx;
449 
450 		INIT_LIST_HEAD(&flow->flowchain);
451 		pie_vars_init(&flow->vars);
452 	}
453 
454 	mod_timer(&q->adapt_timer, jiffies + HZ / 2);
455 
456 	return 0;
457 
458 init_failure:
459 	q->flows_cnt = 0;
460 
461 	return err;
462 }
463 
464 static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
465 {
466 	struct fq_pie_sched_data *q = qdisc_priv(sch);
467 	struct nlattr *opts;
468 
469 	opts = nla_nest_start(skb, TCA_OPTIONS);
470 	if (!opts)
471 		return -EMSGSIZE;
472 
473 	/* convert target from pschedtime to us */
474 	if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
475 	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
476 	    nla_put_u32(skb, TCA_FQ_PIE_TARGET,
477 			((u32)PSCHED_TICKS2NS(q->p_params.target)) /
478 			NSEC_PER_USEC) ||
479 	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
480 			jiffies_to_usecs(q->p_params.tupdate)) ||
481 	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
482 	    nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
483 	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
484 	    nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
485 	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
486 	    nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
487 	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
488 	    nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
489 			q->p_params.dq_rate_estimator))
490 		goto nla_put_failure;
491 
492 	return nla_nest_end(skb, opts);
493 
494 nla_put_failure:
495 	nla_nest_cancel(skb, opts);
496 	return -EMSGSIZE;
497 }
498 
499 static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
500 {
501 	struct fq_pie_sched_data *q = qdisc_priv(sch);
502 	struct tc_fq_pie_xstats st = {
503 		.packets_in	= q->stats.packets_in,
504 		.overlimit	= q->stats.overlimit,
505 		.overmemory	= q->overmemory,
506 		.dropped	= q->stats.dropped,
507 		.ecn_mark	= q->stats.ecn_mark,
508 		.new_flow_count = q->new_flow_count,
509 		.memory_usage   = q->memory_usage,
510 	};
511 	struct list_head *pos;
512 
513 	sch_tree_lock(sch);
514 	list_for_each(pos, &q->new_flows)
515 		st.new_flows_len++;
516 
517 	list_for_each(pos, &q->old_flows)
518 		st.old_flows_len++;
519 	sch_tree_unlock(sch);
520 
521 	return gnet_stats_copy_app(d, &st, sizeof(st));
522 }
523 
524 static void fq_pie_reset(struct Qdisc *sch)
525 {
526 	struct fq_pie_sched_data *q = qdisc_priv(sch);
527 	u32 idx;
528 
529 	INIT_LIST_HEAD(&q->new_flows);
530 	INIT_LIST_HEAD(&q->old_flows);
531 	for (idx = 0; idx < q->flows_cnt; idx++) {
532 		struct fq_pie_flow *flow = q->flows + idx;
533 
534 		/* Removes all packets from flow */
535 		rtnl_kfree_skbs(flow->head, flow->tail);
536 		flow->head = NULL;
537 
538 		INIT_LIST_HEAD(&flow->flowchain);
539 		pie_vars_init(&flow->vars);
540 	}
541 }
542 
543 static void fq_pie_destroy(struct Qdisc *sch)
544 {
545 	struct fq_pie_sched_data *q = qdisc_priv(sch);
546 
547 	tcf_block_put(q->block);
548 	q->p_params.tupdate = 0;
549 	del_timer_sync(&q->adapt_timer);
550 	kvfree(q->flows);
551 }
552 
553 static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
554 	.id		= "fq_pie",
555 	.priv_size	= sizeof(struct fq_pie_sched_data),
556 	.enqueue	= fq_pie_qdisc_enqueue,
557 	.dequeue	= fq_pie_qdisc_dequeue,
558 	.peek		= qdisc_peek_dequeued,
559 	.init		= fq_pie_init,
560 	.destroy	= fq_pie_destroy,
561 	.reset		= fq_pie_reset,
562 	.change		= fq_pie_change,
563 	.dump		= fq_pie_dump,
564 	.dump_stats	= fq_pie_dump_stats,
565 	.owner		= THIS_MODULE,
566 };
567 MODULE_ALIAS_NET_SCH("fq_pie");
568 
569 static int __init fq_pie_module_init(void)
570 {
571 	return register_qdisc(&fq_pie_qdisc_ops);
572 }
573 
574 static void __exit fq_pie_module_exit(void)
575 {
576 	unregister_qdisc(&fq_pie_qdisc_ops);
577 }
578 
579 module_init(fq_pie_module_init);
580 module_exit(fq_pie_module_exit);
581 
582 MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
583 MODULE_AUTHOR("Mohit P. Tahiliani");
584 MODULE_LICENSE("GPL");
585