xref: /linux/net/sched/sch_tbf.c (revision 643e2e259c2b25a2af0ae4c23c6e16586d9fd19c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_tbf.c	Token Bucket Filter queue.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
7  *						 original idea by Martin Devera
8  */
9 
10 #include <linux/module.h>
11 #include <linux/types.h>
12 #include <linux/kernel.h>
13 #include <linux/string.h>
14 #include <linux/errno.h>
15 #include <linux/skbuff.h>
16 #include <net/gso.h>
17 #include <net/netlink.h>
18 #include <net/sch_generic.h>
19 #include <net/pkt_cls.h>
20 #include <net/pkt_sched.h>
21 
22 
23 /*	Simple Token Bucket Filter.
24 	=======================================
25 
26 	SOURCE.
27 	-------
28 
29 	None.
30 
31 	Description.
32 	------------
33 
34 	A data flow obeys TBF with rate R and depth B, if for any
35 	time interval t_i...t_f the number of transmitted bits
36 	does not exceed B + R*(t_f-t_i).
37 
38 	Packetized version of this definition:
39 	The sequence of packets of sizes s_i served at moments t_i
40 	obeys TBF, if for any i<=k:
41 
42 	s_i+....+s_k <= B + R*(t_k - t_i)
43 
44 	Algorithm.
45 	----------
46 
47 	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
48 
49 	N(t+delta) = min{B/R, N(t) + delta}
50 
51 	If the first packet in queue has length S, it may be
52 	transmitted only at the time t_* when S/R <= N(t_*),
53 	and in this case N(t) jumps:
54 
55 	N(t_* + 0) = N(t_* - 0) - S/R.
56 
57 
58 
59 	Actually, QoS requires two TBF to be applied to a data stream.
60 	One of them controls steady state burst size, another
61 	one with rate P (peak rate) and depth M (equal to link MTU)
62 	limits bursts at a smaller time scale.
63 
64 	It is easy to see that P>R, and B>M. If P is infinity, this double
65 	TBF is equivalent to a single one.
66 
67 	When TBF works in reshaping mode, latency is estimated as:
68 
69 	lat = max ((L-B)/R, (L-M)/P)
70 
71 
72 	NOTES.
73 	------
74 
75 	If TBF throttles, it starts a watchdog timer, which will wake it up
76 	when it is ready to transmit.
77 	Note that the minimal timer resolution is 1/HZ.
78 	If no new packets arrive during this period,
79 	or if the device is not awaken by EOI for some previous packet,
80 	TBF can stop its activity for 1/HZ.
81 
82 
83 	This means, that with depth B, the maximal rate is
84 
85 	R_crit = B*HZ
86 
87 	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
88 
89 	Note that the peak rate TBF is much more tough: with MTU 1500
90 	P_crit = 150Kbytes/sec. So, if you need greater peak
91 	rates, use alpha with HZ=1000 :-)
92 
93 	With classful TBF, limit is just kept for backwards compatibility.
94 	It is passed to the default bfifo qdisc - if the inner qdisc is
95 	changed the limit is not effective anymore.
96 */
97 
98 struct tbf_sched_data {
99 /* Parameters */
100 	u32		limit;		/* Maximal length of backlog: bytes */
101 	u32		max_size;
102 	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
103 	s64		mtu;
104 	struct psched_ratecfg rate;
105 	struct psched_ratecfg peak;
106 
107 /* Variables */
108 	s64	tokens;			/* Current number of B tokens */
109 	s64	ptokens;		/* Current number of P tokens */
110 	s64	t_c;			/* Time check-point */
111 	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
112 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
113 };
114 
115 
116 /* Time to Length, convert time in ns to length in bytes
117  * to determinate how many bytes can be sent in given time.
118  */
119 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
120 			 u64 time_in_ns)
121 {
122 	/* The formula is :
123 	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
124 	 */
125 	u64 len = time_in_ns * r->rate_bytes_ps;
126 
127 	do_div(len, NSEC_PER_SEC);
128 
129 	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
130 		do_div(len, 53);
131 		len = len * 48;
132 	}
133 
134 	if (len > r->overhead)
135 		len -= r->overhead;
136 	else
137 		len = 0;
138 
139 	return len;
140 }
141 
142 static void tbf_offload_change(struct Qdisc *sch)
143 {
144 	struct tbf_sched_data *q = qdisc_priv(sch);
145 	struct net_device *dev = qdisc_dev(sch);
146 	struct tc_tbf_qopt_offload qopt;
147 
148 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
149 		return;
150 
151 	qopt.command = TC_TBF_REPLACE;
152 	qopt.handle = sch->handle;
153 	qopt.parent = sch->parent;
154 	qopt.replace_params.rate = q->rate;
155 	qopt.replace_params.max_size = q->max_size;
156 	qopt.replace_params.qstats = &sch->qstats;
157 
158 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
159 }
160 
161 static void tbf_offload_destroy(struct Qdisc *sch)
162 {
163 	struct net_device *dev = qdisc_dev(sch);
164 	struct tc_tbf_qopt_offload qopt;
165 
166 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
167 		return;
168 
169 	qopt.command = TC_TBF_DESTROY;
170 	qopt.handle = sch->handle;
171 	qopt.parent = sch->parent;
172 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
173 }
174 
175 static int tbf_offload_dump(struct Qdisc *sch)
176 {
177 	struct tc_tbf_qopt_offload qopt;
178 
179 	qopt.command = TC_TBF_STATS;
180 	qopt.handle = sch->handle;
181 	qopt.parent = sch->parent;
182 	qopt.stats.bstats = &sch->bstats;
183 	qopt.stats.qstats = &sch->qstats;
184 
185 	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
186 }
187 
188 static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
189 			      struct Qdisc *old, struct netlink_ext_ack *extack)
190 {
191 	struct tc_tbf_qopt_offload graft_offload = {
192 		.handle		= sch->handle,
193 		.parent		= sch->parent,
194 		.child_handle	= new->handle,
195 		.command	= TC_TBF_GRAFT,
196 	};
197 
198 	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
199 				   TC_SETUP_QDISC_TBF, &graft_offload, extack);
200 }
201 
202 /* GSO packet is too big, segment it so that tbf can transmit
203  * each segment in time
204  */
205 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
206 		       struct sk_buff **to_free)
207 {
208 	struct tbf_sched_data *q = qdisc_priv(sch);
209 	struct sk_buff *segs, *nskb;
210 	netdev_features_t features = netif_skb_features(skb);
211 	unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len;
212 	int ret, nb;
213 
214 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
215 
216 	if (IS_ERR_OR_NULL(segs))
217 		return qdisc_drop(skb, sch, to_free);
218 
219 	nb = 0;
220 	skb_list_walk_safe(segs, segs, nskb) {
221 		skb_mark_not_on_list(segs);
222 		seg_len = segs->len;
223 		qdisc_skb_cb(segs)->pkt_len = seg_len;
224 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
225 		if (ret != NET_XMIT_SUCCESS) {
226 			if (net_xmit_drop_count(ret))
227 				qdisc_qstats_drop(sch);
228 		} else {
229 			nb++;
230 			len += seg_len;
231 		}
232 	}
233 	sch->q.qlen += nb;
234 	sch->qstats.backlog += len;
235 	if (nb > 0) {
236 		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
237 		consume_skb(skb);
238 		return NET_XMIT_SUCCESS;
239 	}
240 
241 	kfree_skb(skb);
242 	return NET_XMIT_DROP;
243 }
244 
245 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
246 		       struct sk_buff **to_free)
247 {
248 	struct tbf_sched_data *q = qdisc_priv(sch);
249 	unsigned int len = qdisc_pkt_len(skb);
250 	int ret;
251 
252 	if (qdisc_pkt_len(skb) > q->max_size) {
253 		if (skb_is_gso(skb) &&
254 		    skb_gso_validate_mac_len(skb, q->max_size))
255 			return tbf_segment(skb, sch, to_free);
256 		return qdisc_drop(skb, sch, to_free);
257 	}
258 	ret = qdisc_enqueue(skb, q->qdisc, to_free);
259 	if (ret != NET_XMIT_SUCCESS) {
260 		if (net_xmit_drop_count(ret))
261 			qdisc_qstats_drop(sch);
262 		return ret;
263 	}
264 
265 	sch->qstats.backlog += len;
266 	sch->q.qlen++;
267 	return NET_XMIT_SUCCESS;
268 }
269 
270 static bool tbf_peak_present(const struct tbf_sched_data *q)
271 {
272 	return q->peak.rate_bytes_ps;
273 }
274 
275 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
276 {
277 	struct tbf_sched_data *q = qdisc_priv(sch);
278 	struct sk_buff *skb;
279 
280 	skb = q->qdisc->ops->peek(q->qdisc);
281 
282 	if (skb) {
283 		s64 now;
284 		s64 toks;
285 		s64 ptoks = 0;
286 		unsigned int len = qdisc_pkt_len(skb);
287 
288 		now = ktime_get_ns();
289 		toks = min_t(s64, now - q->t_c, q->buffer);
290 
291 		if (tbf_peak_present(q)) {
292 			ptoks = toks + q->ptokens;
293 			if (ptoks > q->mtu)
294 				ptoks = q->mtu;
295 			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
296 		}
297 		toks += q->tokens;
298 		if (toks > q->buffer)
299 			toks = q->buffer;
300 		toks -= (s64) psched_l2t_ns(&q->rate, len);
301 
302 		if ((toks|ptoks) >= 0) {
303 			skb = qdisc_dequeue_peeked(q->qdisc);
304 			if (unlikely(!skb))
305 				return NULL;
306 
307 			q->t_c = now;
308 			q->tokens = toks;
309 			q->ptokens = ptoks;
310 			qdisc_qstats_backlog_dec(sch, skb);
311 			sch->q.qlen--;
312 			qdisc_bstats_update(sch, skb);
313 			return skb;
314 		}
315 
316 		qdisc_watchdog_schedule_ns(&q->watchdog,
317 					   now + max_t(long, -toks, -ptoks));
318 
319 		/* Maybe we have a shorter packet in the queue,
320 		   which can be sent now. It sounds cool,
321 		   but, however, this is wrong in principle.
322 		   We MUST NOT reorder packets under these circumstances.
323 
324 		   Really, if we split the flow into independent
325 		   subflows, it would be a very good solution.
326 		   This is the main idea of all FQ algorithms
327 		   (cf. CSZ, HPFQ, HFSC)
328 		 */
329 
330 		qdisc_qstats_overlimit(sch);
331 	}
332 	return NULL;
333 }
334 
335 static void tbf_reset(struct Qdisc *sch)
336 {
337 	struct tbf_sched_data *q = qdisc_priv(sch);
338 
339 	qdisc_reset(q->qdisc);
340 	q->t_c = ktime_get_ns();
341 	q->tokens = q->buffer;
342 	q->ptokens = q->mtu;
343 	qdisc_watchdog_cancel(&q->watchdog);
344 }
345 
346 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
347 	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
348 	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
349 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
350 	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
351 	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
352 	[TCA_TBF_BURST] = { .type = NLA_U32 },
353 	[TCA_TBF_PBURST] = { .type = NLA_U32 },
354 };
355 
356 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
357 		      struct netlink_ext_ack *extack)
358 {
359 	int err;
360 	struct tbf_sched_data *q = qdisc_priv(sch);
361 	struct nlattr *tb[TCA_TBF_MAX + 1];
362 	struct tc_tbf_qopt *qopt;
363 	struct Qdisc *child = NULL;
364 	struct Qdisc *old = NULL;
365 	struct psched_ratecfg rate;
366 	struct psched_ratecfg peak;
367 	u64 max_size;
368 	s64 buffer, mtu;
369 	u64 rate64 = 0, prate64 = 0;
370 
371 	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
372 					  NULL);
373 	if (err < 0)
374 		return err;
375 
376 	err = -EINVAL;
377 	if (tb[TCA_TBF_PARMS] == NULL)
378 		goto done;
379 
380 	qopt = nla_data(tb[TCA_TBF_PARMS]);
381 	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
382 		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
383 					      tb[TCA_TBF_RTAB],
384 					      NULL));
385 
386 	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
387 			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
388 						      tb[TCA_TBF_PTAB],
389 						      NULL));
390 
391 	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
392 	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
393 
394 	if (tb[TCA_TBF_RATE64])
395 		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
396 	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
397 
398 	if (tb[TCA_TBF_BURST]) {
399 		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
400 		buffer = psched_l2t_ns(&rate, max_size);
401 	} else {
402 		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
403 	}
404 
405 	if (qopt->peakrate.rate) {
406 		if (tb[TCA_TBF_PRATE64])
407 			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
408 		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
409 		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
410 			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
411 					peak.rate_bytes_ps, rate.rate_bytes_ps);
412 			err = -EINVAL;
413 			goto done;
414 		}
415 
416 		if (tb[TCA_TBF_PBURST]) {
417 			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
418 			max_size = min_t(u32, max_size, pburst);
419 			mtu = psched_l2t_ns(&peak, pburst);
420 		} else {
421 			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
422 		}
423 	} else {
424 		memset(&peak, 0, sizeof(peak));
425 	}
426 
427 	if (max_size < psched_mtu(qdisc_dev(sch)))
428 		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
429 				    max_size, qdisc_dev(sch)->name,
430 				    psched_mtu(qdisc_dev(sch)));
431 
432 	if (!max_size) {
433 		err = -EINVAL;
434 		goto done;
435 	}
436 
437 	if (q->qdisc != &noop_qdisc) {
438 		err = fifo_set_limit(q->qdisc, qopt->limit);
439 		if (err)
440 			goto done;
441 	} else if (qopt->limit > 0) {
442 		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
443 					 extack);
444 		if (IS_ERR(child)) {
445 			err = PTR_ERR(child);
446 			goto done;
447 		}
448 
449 		/* child is fifo, no need to check for noop_qdisc */
450 		qdisc_hash_add(child, true);
451 	}
452 
453 	sch_tree_lock(sch);
454 	if (child) {
455 		qdisc_tree_flush_backlog(q->qdisc);
456 		old = q->qdisc;
457 		q->qdisc = child;
458 	}
459 	q->limit = qopt->limit;
460 	if (tb[TCA_TBF_PBURST])
461 		q->mtu = mtu;
462 	else
463 		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
464 	q->max_size = max_size;
465 	if (tb[TCA_TBF_BURST])
466 		q->buffer = buffer;
467 	else
468 		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
469 	q->tokens = q->buffer;
470 	q->ptokens = q->mtu;
471 
472 	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
473 	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
474 
475 	sch_tree_unlock(sch);
476 	qdisc_put(old);
477 	err = 0;
478 
479 	tbf_offload_change(sch);
480 done:
481 	return err;
482 }
483 
484 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
485 		    struct netlink_ext_ack *extack)
486 {
487 	struct tbf_sched_data *q = qdisc_priv(sch);
488 
489 	qdisc_watchdog_init(&q->watchdog, sch);
490 	q->qdisc = &noop_qdisc;
491 
492 	if (!opt)
493 		return -EINVAL;
494 
495 	q->t_c = ktime_get_ns();
496 
497 	return tbf_change(sch, opt, extack);
498 }
499 
500 static void tbf_destroy(struct Qdisc *sch)
501 {
502 	struct tbf_sched_data *q = qdisc_priv(sch);
503 
504 	qdisc_watchdog_cancel(&q->watchdog);
505 	tbf_offload_destroy(sch);
506 	qdisc_put(q->qdisc);
507 }
508 
509 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
510 {
511 	struct tbf_sched_data *q = qdisc_priv(sch);
512 	struct nlattr *nest;
513 	struct tc_tbf_qopt opt;
514 	int err;
515 
516 	err = tbf_offload_dump(sch);
517 	if (err)
518 		return err;
519 
520 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
521 	if (nest == NULL)
522 		goto nla_put_failure;
523 
524 	opt.limit = q->limit;
525 	psched_ratecfg_getrate(&opt.rate, &q->rate);
526 	if (tbf_peak_present(q))
527 		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
528 	else
529 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
530 	opt.mtu = PSCHED_NS2TICKS(q->mtu);
531 	opt.buffer = PSCHED_NS2TICKS(q->buffer);
532 	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
533 		goto nla_put_failure;
534 	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
535 	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
536 			      TCA_TBF_PAD))
537 		goto nla_put_failure;
538 	if (tbf_peak_present(q) &&
539 	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
540 	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
541 			      TCA_TBF_PAD))
542 		goto nla_put_failure;
543 
544 	return nla_nest_end(skb, nest);
545 
546 nla_put_failure:
547 	nla_nest_cancel(skb, nest);
548 	return -1;
549 }
550 
551 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
552 			  struct sk_buff *skb, struct tcmsg *tcm)
553 {
554 	struct tbf_sched_data *q = qdisc_priv(sch);
555 
556 	tcm->tcm_handle |= TC_H_MIN(1);
557 	tcm->tcm_info = q->qdisc->handle;
558 
559 	return 0;
560 }
561 
562 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
563 		     struct Qdisc **old, struct netlink_ext_ack *extack)
564 {
565 	struct tbf_sched_data *q = qdisc_priv(sch);
566 
567 	if (new == NULL)
568 		new = &noop_qdisc;
569 
570 	*old = qdisc_replace(sch, new, &q->qdisc);
571 
572 	tbf_offload_graft(sch, new, *old, extack);
573 	return 0;
574 }
575 
576 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
577 {
578 	struct tbf_sched_data *q = qdisc_priv(sch);
579 	return q->qdisc;
580 }
581 
582 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
583 {
584 	return 1;
585 }
586 
587 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
588 {
589 	if (!walker->stop) {
590 		tc_qdisc_stats_dump(sch, 1, walker);
591 	}
592 }
593 
594 static const struct Qdisc_class_ops tbf_class_ops = {
595 	.graft		=	tbf_graft,
596 	.leaf		=	tbf_leaf,
597 	.find		=	tbf_find,
598 	.walk		=	tbf_walk,
599 	.dump		=	tbf_dump_class,
600 };
601 
602 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
603 	.next		=	NULL,
604 	.cl_ops		=	&tbf_class_ops,
605 	.id		=	"tbf",
606 	.priv_size	=	sizeof(struct tbf_sched_data),
607 	.enqueue	=	tbf_enqueue,
608 	.dequeue	=	tbf_dequeue,
609 	.peek		=	qdisc_peek_dequeued,
610 	.init		=	tbf_init,
611 	.reset		=	tbf_reset,
612 	.destroy	=	tbf_destroy,
613 	.change		=	tbf_change,
614 	.dump		=	tbf_dump,
615 	.owner		=	THIS_MODULE,
616 };
617 MODULE_ALIAS_NET_SCH("tbf");
618 
619 static int __init tbf_module_init(void)
620 {
621 	return register_qdisc(&tbf_qdisc_ops);
622 }
623 
624 static void __exit tbf_module_exit(void)
625 {
626 	unregister_qdisc(&tbf_qdisc_ops);
627 }
628 module_init(tbf_module_init)
629 module_exit(tbf_module_exit)
630 MODULE_LICENSE("GPL");
631 MODULE_DESCRIPTION("Token Bucket Filter qdisc");
632