// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/gso.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.


	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size; the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinite, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
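
/* A worked example of the latency bound above, with made-up numbers
 * (illustrative only, not part of the original comment): take
 * R = 125000 bytes/sec (1 Mbit/s), bucket depth B = 10000 bytes, no peak
 * rate (P treated as infinite), and take L, the backlog limit, as 60000
 * bytes.  Then
 *
 *	lat = (L - B)/R = 50000/125000 = 0.4 sec
 *
 * which is roughly the time a full backlog needs to drain at rate R once
 * the B bytes worth of accumulated tokens have been spent.
 */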

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}

static void tbf_offload_change(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_REPLACE;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.replace_params.rate = q->rate;
	qopt.replace_params.max_size = q->max_size;
	qopt.replace_params.qstats = &sch->qstats;

	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_DESTROY;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
	struct tc_tbf_qopt_offload qopt;

	qopt.command = TC_TBF_STATS;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.stats.bstats = &sch->bstats;
	qopt.stats.qstats = &sch->qstats;

	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
			      struct Qdisc *old, struct netlink_ext_ack *extack)
{
	struct tc_tbf_qopt_offload graft_offload = {
		.handle		= sch->handle,
		.parent		= sch->parent,
		.child_handle	= new->handle,
		.command	= TC_TBF_GRAFT,
	};

	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
				   TC_SETUP_QDISC_TBF, &graft_offload, extack);
}
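
/* Illustrative accounting note (not from the original comments): if a
 * single GSO skb accounted upstream as one packet of prev_len bytes is
 * split below into nb segments totalling len bytes, tbf_segment() calls
 * qdisc_tree_reduce_backlog() with (1 - nb) packets and (prev_len - len)
 * bytes.  Both deltas are typically negative, which tells the ancestor
 * qdiscs that their qlen/backlog actually grew, e.g. by nb - 1 packets.
 */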

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len;
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);
		seg_len = segs->len;
		qdisc_skb_cb(segs)->pkt_len = seg_len;
		qdisc_skb_cb(segs)->pkt_segs = 1;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
			len += seg_len;
		}
	}
	sch->q.qlen += nb;
	sch->qstats.backlog += len;
	if (nb > 0) {
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
		consume_skb(skb);
		return NET_XMIT_SUCCESS;
	}

	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}
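
/* Illustrative token arithmetic (made-up numbers, not from the original
 * comments): tokens are stored as nanoseconds of transmission time at the
 * configured rate, so at R = 125000 bytes/sec a 1250 byte packet costs
 * psched_l2t_ns() = 10 ms worth of tokens.  If the subtraction in
 * tbf_dequeue() leaves toks at -3 ms, the packet stays queued and the
 * watchdog above is armed for now + 3 ms, the earliest instant at which
 * enough tokens will have accumulated to send it.
 */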

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct Qdisc *old = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);

			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_purge_queue(q->qdisc);
		old = q->qdisc;
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	qdisc_put(old);
	err = 0;

	tbf_offload_change(sch);
done:
	return err;
}
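
/* Illustrative userspace mapping (an assumption about typical iproute2
 * usage, not taken from this file): a command such as
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 50ms
 *
 * arrives here as the netlink attributes parsed by tbf_change() above.
 * The rate (1 Mbit/s = 125000 bytes/sec) ends up in q->rate, the burst
 * becomes the bucket depth (q->buffer, stored as nanoseconds at that
 * rate, with q->max_size as the corresponding byte cap), and the limit
 * that tc derives from the latency sizes the default bfifo child qdisc.
 */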

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	tbf_offload_destroy(sch);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;
	int err;

	err = tbf_offload_dump(sch);
	if (err)
		return err;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);

	tbf_offload_graft(sch, new, *old, extack);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}
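
/* Explanatory note (not in the original): TBF exposes exactly one class,
 * with minor number 1, which holds the inner qdisc.  tbf_find() therefore
 * always returns 1, and the walk below reports that single class via
 * tc_qdisc_stats_dump().
 */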

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		tc_qdisc_stats_dump(sch, 1, walker);
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("tbf");

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Token Bucket Filter qdisc");