xref: /linux/net/sched/sch_tbf.c (revision 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_tbf.c	Token Bucket Filter queue.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
7  *						 original idea by Martin Devera
8  */
9 
10 #include <linux/module.h>
11 #include <linux/types.h>
12 #include <linux/kernel.h>
13 #include <linux/string.h>
14 #include <linux/errno.h>
15 #include <linux/skbuff.h>
16 #include <net/gso.h>
17 #include <net/netlink.h>
18 #include <net/sch_generic.h>
19 #include <net/pkt_cls.h>
20 #include <net/pkt_sched.h>
21 
22 
23 /*	Simple Token Bucket Filter.
24 	=======================================
25 
26 	SOURCE.
27 	-------
28 
29 	None.
30 
31 	Description.
32 	------------
33 
34 	A data flow obeys TBF with rate R and depth B, if for any
35 	time interval t_i...t_f the number of transmitted bits
36 	does not exceed B + R*(t_f-t_i).
37 
38 	Packetized version of this definition:
39 	The sequence of packets of sizes s_i served at moments t_i
40 	obeys TBF, if for any i<=k:
41 
42 	s_i+....+s_k <= B + R*(t_k - t_i)
43 
44 	Algorithm.
45 	----------
46 
47 	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
48 
49 	N(t+delta) = min{B/R, N(t) + delta}
50 
51 	If the first packet in the queue has length S, it may be
52 	transmitted only at a time t_* when S/R <= N(t_*),
53 	and in this case N(t) jumps:
54 
55 	N(t_* + 0) = N(t_* - 0) - S/R.
56 
57 
58 
59 	In practice, QoS requires two TBFs to be applied to a data stream.
60 	One of them controls the steady-state burst size; the other,
61 	with rate P (the peak rate) and depth M (equal to the link MTU),
62 	limits bursts on a smaller time scale.
63 
64 	It is easy to see that P>R and B>M. If P is infinite, this double
65 	TBF is equivalent to a single one.
66 
67 	When TBF works in reshaping mode, the latency for a backlog limit of L bytes is estimated as:
68 
69 	lat = max ((L-B)/R, (L-M)/P)
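
	For instance (illustrative numbers only): with L = 10 Kbytes,
	B = 5 Kbytes, R = 125 Kbytes/sec and no peak rate configured,
	the worst-case delay is (L-B)/R = 40 msec.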
70 
71 
72 	NOTES.
73 	------
74 
75 	If TBF throttles, it starts a watchdog timer, which will wake it up
76 	when it is ready to transmit.
77 	Note that the minimal timer resolution is 1/HZ.
78 	If no new packets arrive during this period,
79 	or if the device is not woken up by an EOI for some previous packet,
80 	TBF can stop its activity for 1/HZ.
81 
82 
83 	This means that, with depth B, the maximal rate is
84 
85 	R_crit = B*HZ
86 
87 	E.g., for 10 Mbit Ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.
88 
89 	Note that the peak rate TBF is much tougher: with an MTU of 1500,
90 	P_crit = 150 Kbytes/sec. So, if you need higher peak
91 	rates, use an Alpha with HZ=1000 :-)
92 
93 	With classful TBF, the limit is kept only for backwards compatibility.
94 	It is passed to the default bfifo qdisc; if the inner qdisc is
95 	changed, the limit is no longer effective.
96 */
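
/* A minimal userspace sketch of the single token bucket described above,
 * tracking tokens in bytes rather than in units of time (an equivalent
 * formulation: tokens_in_bytes = N(t) * R). Purely illustrative; none of
 * the names below exist in the kernel:
 *
 *	struct bucket {
 *		double rate;	// R, bytes per second
 *		double depth;	// B, bytes
 *		double tokens;	// currently available bytes, <= depth
 *		double last;	// time of the last update, in seconds
 *	};
 *
 *	// Return 1 if a packet of 'bytes' bytes may be sent at time 'now',
 *	// 0 if the caller must wait (bytes - tokens) / rate seconds.
 *	static int bucket_send(struct bucket *b, double now, double bytes)
 *	{
 *		b->tokens += (now - b->last) * b->rate;
 *		if (b->tokens > b->depth)
 *			b->tokens = b->depth;
 *		b->last = now;
 *		if (b->tokens < bytes)
 *			return 0;
 *		b->tokens -= bytes;
 *		return 1;
 *	}
 */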
97 
98 struct tbf_sched_data {
99 /* Parameters */
100 	u32		limit;		/* Maximal length of backlog: bytes */
101 	u32		max_size;
102 	s64		buffer;		/* Token bucket depth/rate, in ns: MUST BE >= MTU/rate */
103 	s64		mtu;
104 	struct psched_ratecfg rate;
105 	struct psched_ratecfg peak;
106 
107 /* Variables */
108 	s64	tokens;			/* Current number of B tokens */
109 	s64	ptokens;		/* Current number of P tokens */
110 	s64	t_c;			/* Time check-point */
111 	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
112 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
113 };
114 
115 
116 /* Time to length: convert a time in ns to a length in bytes
117  * to determine how many bytes can be sent in the given time.
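 *
 * For example (illustrative numbers only): with r->rate_bytes_ps = 125000
 * (1 Mbit/sec) and time_in_ns = 8000000 (8 msec), len = 1000 bytes; on an
 * ATM link layer this shrinks to (1000/53)*48 = 864 payload bytes (integer
 * division), before any configured overhead is subtracted.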
118  */
119 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
120 			 u64 time_in_ns)
121 {
122 	/* The formula is :
123 	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
124 	 */
125 	u64 len = time_in_ns * r->rate_bytes_ps;
126 
127 	do_div(len, NSEC_PER_SEC);
128 
129 	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
130 		do_div(len, 53);
131 		len = len * 48;
132 	}
133 
134 	if (len > r->overhead)
135 		len -= r->overhead;
136 	else
137 		len = 0;
138 
139 	return len;
140 }
141 
142 static void tbf_offload_change(struct Qdisc *sch)
143 {
144 	struct tbf_sched_data *q = qdisc_priv(sch);
145 	struct net_device *dev = qdisc_dev(sch);
146 	struct tc_tbf_qopt_offload qopt;
147 
148 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
149 		return;
150 
151 	qopt.command = TC_TBF_REPLACE;
152 	qopt.handle = sch->handle;
153 	qopt.parent = sch->parent;
154 	qopt.replace_params.rate = q->rate;
155 	qopt.replace_params.max_size = q->max_size;
156 	qopt.replace_params.qstats = &sch->qstats;
157 
158 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
159 }
160 
161 static void tbf_offload_destroy(struct Qdisc *sch)
162 {
163 	struct net_device *dev = qdisc_dev(sch);
164 	struct tc_tbf_qopt_offload qopt;
165 
166 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
167 		return;
168 
169 	qopt.command = TC_TBF_DESTROY;
170 	qopt.handle = sch->handle;
171 	qopt.parent = sch->parent;
172 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
173 }
174 
175 static int tbf_offload_dump(struct Qdisc *sch)
176 {
177 	struct tc_tbf_qopt_offload qopt;
178 
179 	qopt.command = TC_TBF_STATS;
180 	qopt.handle = sch->handle;
181 	qopt.parent = sch->parent;
182 	qopt.stats.bstats = &sch->bstats;
183 	qopt.stats.qstats = &sch->qstats;
184 
185 	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
186 }
187 
188 static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
189 			      struct Qdisc *old, struct netlink_ext_ack *extack)
190 {
191 	struct tc_tbf_qopt_offload graft_offload = {
192 		.handle		= sch->handle,
193 		.parent		= sch->parent,
194 		.child_handle	= new->handle,
195 		.command	= TC_TBF_GRAFT,
196 	};
197 
198 	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
199 				   TC_SETUP_QDISC_TBF, &graft_offload, extack);
200 }
201 
202 /* GSO packet is too big, segment it so that tbf can transmit
203  * each segment in time
204  */
205 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
206 		       struct sk_buff **to_free)
207 {
208 	struct tbf_sched_data *q = qdisc_priv(sch);
209 	struct sk_buff *segs, *nskb;
210 	netdev_features_t features = netif_skb_features(skb);
211 	unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len;
212 	int ret, nb;
213 
214 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
215 
216 	if (IS_ERR_OR_NULL(segs))
217 		return qdisc_drop(skb, sch, to_free);
218 
219 	nb = 0;
220 	skb_list_walk_safe(segs, segs, nskb) {
221 		skb_mark_not_on_list(segs);
222 		seg_len = segs->len;
223 		qdisc_skb_cb(segs)->pkt_len = seg_len;
224 		qdisc_skb_cb(segs)->pkt_segs = 1;
225 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
226 		if (ret != NET_XMIT_SUCCESS) {
227 			if (net_xmit_drop_count(ret))
228 				qdisc_qstats_drop(sch);
229 		} else {
230 			nb++;
231 			len += seg_len;
232 		}
233 	}
234 	sch->q.qlen += nb;
235 	sch->qstats.backlog += len;
236 	if (nb > 0) {
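		/* One GSO skb was replaced by nb segments: tell the ancestors
		 * that qlen grew by nb - 1 and the backlog changed by
		 * len - prev_len.
		 */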
237 		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
238 		consume_skb(skb);
239 		return NET_XMIT_SUCCESS;
240 	}
241 
242 	kfree_skb(skb);
243 	return NET_XMIT_DROP;
244 }
245 
246 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
247 		       struct sk_buff **to_free)
248 {
249 	struct tbf_sched_data *q = qdisc_priv(sch);
250 	unsigned int len = qdisc_pkt_len(skb);
251 	int ret;
252 
253 	if (qdisc_pkt_len(skb) > q->max_size) {
254 		if (skb_is_gso(skb) &&
255 		    skb_gso_validate_mac_len(skb, q->max_size))
256 			return tbf_segment(skb, sch, to_free);
257 		return qdisc_drop(skb, sch, to_free);
258 	}
259 	ret = qdisc_enqueue(skb, q->qdisc, to_free);
260 	if (ret != NET_XMIT_SUCCESS) {
261 		if (net_xmit_drop_count(ret))
262 			qdisc_qstats_drop(sch);
263 		return ret;
264 	}
265 
266 	sch->qstats.backlog += len;
267 	sch->q.qlen++;
268 	return NET_XMIT_SUCCESS;
269 }
270 
271 static bool tbf_peak_present(const struct tbf_sched_data *q)
272 {
273 	return q->peak.rate_bytes_ps;
274 }
275 
276 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
277 {
278 	struct tbf_sched_data *q = qdisc_priv(sch);
279 	struct sk_buff *skb;
280 
281 	skb = q->qdisc->ops->peek(q->qdisc);
282 
283 	if (skb) {
284 		s64 now;
285 		s64 toks;
286 		s64 ptoks = 0;
287 		unsigned int len = qdisc_pkt_len(skb);
288 
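		/* Tokens are accounted in nanoseconds of transmission time at
		 * the configured rate: grow them by the time elapsed since the
		 * last check-point, capped at the bucket depth (q->buffer for
		 * the rate bucket, q->mtu for the peak bucket).
		 */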
289 		now = ktime_get_ns();
290 		toks = min_t(s64, now - q->t_c, q->buffer);
291 
292 		if (tbf_peak_present(q)) {
293 			ptoks = toks + q->ptokens;
294 			if (ptoks > q->mtu)
295 				ptoks = q->mtu;
296 			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
297 		}
298 		toks += q->tokens;
299 		if (toks > q->buffer)
300 			toks = q->buffer;
301 		toks -= (s64) psched_l2t_ns(&q->rate, len);
302 
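		/* Both s64 values are non-negative iff their bitwise OR is
		 * non-negative, so a single test covers both buckets.
		 */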
303 		if ((toks|ptoks) >= 0) {
304 			skb = qdisc_dequeue_peeked(q->qdisc);
305 			if (unlikely(!skb))
306 				return NULL;
307 
308 			q->t_c = now;
309 			q->tokens = toks;
310 			q->ptokens = ptoks;
311 			qdisc_qstats_backlog_dec(sch, skb);
312 			sch->q.qlen--;
313 			qdisc_bstats_update(sch, skb);
314 			return skb;
315 		}
316 
317 		qdisc_watchdog_schedule_ns(&q->watchdog,
318 					   now + max_t(long, -toks, -ptoks));
319 
320 		/* Maybe we have a shorter packet in the queue,
321 		   which could be sent now. It sounds tempting,
322 		   but this is wrong in principle:
323 		   we MUST NOT reorder packets under these circumstances.
324 
325 		   If we split the flow into independent
326 		   subflows instead, that would be a very good solution.
327 		   This is the main idea behind all FQ algorithms
328 		   (cf. CSZ, HPFQ, HFSC).
329 		 */
330 
331 		qdisc_qstats_overlimit(sch);
332 	}
333 	return NULL;
334 }
335 
336 static void tbf_reset(struct Qdisc *sch)
337 {
338 	struct tbf_sched_data *q = qdisc_priv(sch);
339 
340 	qdisc_reset(q->qdisc);
341 	q->t_c = ktime_get_ns();
342 	q->tokens = q->buffer;
343 	q->ptokens = q->mtu;
344 	qdisc_watchdog_cancel(&q->watchdog);
345 }
346 
347 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
348 	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
349 	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
350 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
351 	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
352 	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
353 	[TCA_TBF_BURST] = { .type = NLA_U32 },
354 	[TCA_TBF_PBURST] = { .type = NLA_U32 },
355 };
356 
357 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
358 		      struct netlink_ext_ack *extack)
359 {
360 	int err;
361 	struct tbf_sched_data *q = qdisc_priv(sch);
362 	struct nlattr *tb[TCA_TBF_MAX + 1];
363 	struct tc_tbf_qopt *qopt;
364 	struct Qdisc *child = NULL;
365 	struct Qdisc *old = NULL;
366 	struct psched_ratecfg rate;
367 	struct psched_ratecfg peak;
368 	u64 max_size;
369 	s64 buffer, mtu;
370 	u64 rate64 = 0, prate64 = 0;
371 
372 	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
373 					  NULL);
374 	if (err < 0)
375 		return err;
376 
377 	err = -EINVAL;
378 	if (tb[TCA_TBF_PARMS] == NULL)
379 		goto done;
380 
381 	qopt = nla_data(tb[TCA_TBF_PARMS]);
382 	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
383 		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
384 					      tb[TCA_TBF_RTAB],
385 					      NULL));
386 
387 	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
388 			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
389 						      tb[TCA_TBF_PTAB],
390 						      NULL));
391 
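	/* Legacy userspace passes buffer and mtu in psched ticks; convert
	 * them to nanoseconds and clamp to 32 bits.
	 */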
392 	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
393 	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
394 
395 	if (tb[TCA_TBF_RATE64])
396 		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
397 	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
398 
399 	if (tb[TCA_TBF_BURST]) {
400 		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
401 		buffer = psched_l2t_ns(&rate, max_size);
402 	} else {
403 		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
404 	}
405 
406 	if (qopt->peakrate.rate) {
407 		if (tb[TCA_TBF_PRATE64])
408 			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
409 		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
410 		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
411 			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
412 					peak.rate_bytes_ps, rate.rate_bytes_ps);
413 			err = -EINVAL;
414 			goto done;
415 		}
416 
417 		if (tb[TCA_TBF_PBURST]) {
418 			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
419 			max_size = min_t(u32, max_size, pburst);
420 			mtu = psched_l2t_ns(&peak, pburst);
421 		} else {
422 			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
423 		}
424 	} else {
425 		memset(&peak, 0, sizeof(peak));
426 	}
427 
428 	if (max_size < psched_mtu(qdisc_dev(sch)))
429 		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
430 				    max_size, qdisc_dev(sch)->name,
431 				    psched_mtu(qdisc_dev(sch)));
432 
433 	if (!max_size) {
434 		err = -EINVAL;
435 		goto done;
436 	}
437 
438 	if (q->qdisc != &noop_qdisc) {
439 		err = fifo_set_limit(q->qdisc, qopt->limit);
440 		if (err)
441 			goto done;
442 	} else if (qopt->limit > 0) {
443 		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
444 					 extack);
445 		if (IS_ERR(child)) {
446 			err = PTR_ERR(child);
447 			goto done;
448 		}
449 
450 		/* child is fifo, no need to check for noop_qdisc */
451 		qdisc_hash_add(child, true);
452 	}
453 
454 	sch_tree_lock(sch);
455 	if (child) {
456 		qdisc_purge_queue(q->qdisc);
457 		old = q->qdisc;
458 		q->qdisc = child;
459 	}
460 	q->limit = qopt->limit;
461 	if (tb[TCA_TBF_PBURST])
462 		q->mtu = mtu;
463 	else
464 		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
465 	q->max_size = max_size;
466 	if (tb[TCA_TBF_BURST])
467 		q->buffer = buffer;
468 	else
469 		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
470 	q->tokens = q->buffer;
471 	q->ptokens = q->mtu;
472 
473 	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
474 	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
475 
476 	sch_tree_unlock(sch);
477 	qdisc_put(old);
478 	err = 0;
479 
480 	tbf_offload_change(sch);
481 done:
482 	return err;
483 }
484 
485 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
486 		    struct netlink_ext_ack *extack)
487 {
488 	struct tbf_sched_data *q = qdisc_priv(sch);
489 
490 	qdisc_watchdog_init(&q->watchdog, sch);
491 	q->qdisc = &noop_qdisc;
492 
493 	if (!opt)
494 		return -EINVAL;
495 
496 	q->t_c = ktime_get_ns();
497 
498 	return tbf_change(sch, opt, extack);
499 }
500 
501 static void tbf_destroy(struct Qdisc *sch)
502 {
503 	struct tbf_sched_data *q = qdisc_priv(sch);
504 
505 	qdisc_watchdog_cancel(&q->watchdog);
506 	tbf_offload_destroy(sch);
507 	qdisc_put(q->qdisc);
508 }
509 
510 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
511 {
512 	struct tbf_sched_data *q = qdisc_priv(sch);
513 	struct nlattr *nest;
514 	struct tc_tbf_qopt opt;
515 	int err;
516 
517 	err = tbf_offload_dump(sch);
518 	if (err)
519 		return err;
520 
521 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
522 	if (nest == NULL)
523 		goto nla_put_failure;
524 
525 	opt.limit = q->limit;
526 	psched_ratecfg_getrate(&opt.rate, &q->rate);
527 	if (tbf_peak_present(q))
528 		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
529 	else
530 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
531 	opt.mtu = PSCHED_NS2TICKS(q->mtu);
532 	opt.buffer = PSCHED_NS2TICKS(q->buffer);
533 	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
534 		goto nla_put_failure;
535 	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
536 	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
537 			      TCA_TBF_PAD))
538 		goto nla_put_failure;
539 	if (tbf_peak_present(q) &&
540 	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
541 	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
542 			      TCA_TBF_PAD))
543 		goto nla_put_failure;
544 
545 	return nla_nest_end(skb, nest);
546 
547 nla_put_failure:
548 	nla_nest_cancel(skb, nest);
549 	return -1;
550 }
551 
552 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
553 			  struct sk_buff *skb, struct tcmsg *tcm)
554 {
555 	struct tbf_sched_data *q = qdisc_priv(sch);
556 
557 	tcm->tcm_handle |= TC_H_MIN(1);
558 	tcm->tcm_info = q->qdisc->handle;
559 
560 	return 0;
561 }
562 
563 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
564 		     struct Qdisc **old, struct netlink_ext_ack *extack)
565 {
566 	struct tbf_sched_data *q = qdisc_priv(sch);
567 
568 	if (new == NULL)
569 		new = &noop_qdisc;
570 
571 	*old = qdisc_replace(sch, new, &q->qdisc);
572 
573 	tbf_offload_graft(sch, new, *old, extack);
574 	return 0;
575 }
576 
577 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
578 {
579 	struct tbf_sched_data *q = qdisc_priv(sch);
580 	return q->qdisc;
581 }
582 
583 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
584 {
585 	return 1;
586 }
587 
588 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
589 {
590 	if (!walker->stop) {
591 		tc_qdisc_stats_dump(sch, 1, walker);
592 	}
593 }
594 
595 static const struct Qdisc_class_ops tbf_class_ops = {
596 	.graft		=	tbf_graft,
597 	.leaf		=	tbf_leaf,
598 	.find		=	tbf_find,
599 	.walk		=	tbf_walk,
600 	.dump		=	tbf_dump_class,
601 };
602 
603 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
604 	.next		=	NULL,
605 	.cl_ops		=	&tbf_class_ops,
606 	.id		=	"tbf",
607 	.priv_size	=	sizeof(struct tbf_sched_data),
608 	.enqueue	=	tbf_enqueue,
609 	.dequeue	=	tbf_dequeue,
610 	.peek		=	qdisc_peek_dequeued,
611 	.init		=	tbf_init,
612 	.reset		=	tbf_reset,
613 	.destroy	=	tbf_destroy,
614 	.change		=	tbf_change,
615 	.dump		=	tbf_dump,
616 	.owner		=	THIS_MODULE,
617 };
618 MODULE_ALIAS_NET_SCH("tbf");
619 
620 static int __init tbf_module_init(void)
621 {
622 	return register_qdisc(&tbf_qdisc_ops);
623 }
624 
625 static void __exit tbf_module_exit(void)
626 {
627 	unregister_qdisc(&tbf_qdisc_ops);
628 }
629 module_init(tbf_module_init)
630 module_exit(tbf_module_exit)
631 MODULE_LICENSE("GPL");
632 MODULE_DESCRIPTION("Token Bucket Filter qdisc");
633