1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * net/sched/sch_tbf.c Token Bucket Filter queue. 4 * 5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 6 * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - 7 * original idea by Martin Devera 8 */ 9 10 #include <linux/module.h> 11 #include <linux/types.h> 12 #include <linux/kernel.h> 13 #include <linux/string.h> 14 #include <linux/errno.h> 15 #include <linux/skbuff.h> 16 #include <net/gso.h> 17 #include <net/netlink.h> 18 #include <net/sch_generic.h> 19 #include <net/pkt_cls.h> 20 #include <net/pkt_sched.h> 21 22 23 /* Simple Token Bucket Filter. 24 ======================================= 25 26 SOURCE. 27 ------- 28 29 None. 30 31 Description. 32 ------------ 33 34 A data flow obeys TBF with rate R and depth B, if for any 35 time interval t_i...t_f the number of transmitted bits 36 does not exceed B + R*(t_f-t_i). 37 38 Packetized version of this definition: 39 The sequence of packets of sizes s_i served at moments t_i 40 obeys TBF, if for any i<=k: 41 42 s_i+....+s_k <= B + R*(t_k - t_i) 43 44 Algorithm. 45 ---------- 46 47 Let N(t_i) be B/R initially and N(t) grow continuously with time as: 48 49 N(t+delta) = min{B/R, N(t) + delta} 50 51 If the first packet in queue has length S, it may be 52 transmitted only at the time t_* when S/R <= N(t_*), 53 and in this case N(t) jumps: 54 55 N(t_* + 0) = N(t_* - 0) - S/R. 56 57 58 59 Actually, QoS requires two TBF to be applied to a data stream. 60 One of them controls steady state burst size, another 61 one with rate P (peak rate) and depth M (equal to link MTU) 62 limits bursts at a smaller time scale. 63 64 It is easy to see that P>R, and B>M. If P is infinity, this double 65 TBF is equivalent to a single one. 66 67 When TBF works in reshaping mode, latency is estimated as: 68 69 lat = max ((L-B)/R, (L-M)/P) 70 71 72 NOTES. 73 ------ 74 75 If TBF throttles, it starts a watchdog timer, which will wake it up 76 when it is ready to transmit. 77 Note that the minimal timer resolution is 1/HZ. 78 If no new packets arrive during this period, 79 or if the device is not awaken by EOI for some previous packet, 80 TBF can stop its activity for 1/HZ. 81 82 83 This means, that with depth B, the maximal rate is 84 85 R_crit = B*HZ 86 87 F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. 88 89 Note that the peak rate TBF is much more tough: with MTU 1500 90 P_crit = 150Kbytes/sec. So, if you need greater peak 91 rates, use alpha with HZ=1000 :-) 92 93 With classful TBF, limit is just kept for backwards compatibility. 94 It is passed to the default bfifo qdisc - if the inner qdisc is 95 changed the limit is not effective anymore. 96 */ 97 98 struct tbf_sched_data { 99 /* Parameters */ 100 u32 limit; /* Maximal length of backlog: bytes */ 101 u32 max_size; 102 s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ 103 s64 mtu; 104 struct psched_ratecfg rate; 105 struct psched_ratecfg peak; 106 107 /* Variables */ 108 s64 tokens; /* Current number of B tokens */ 109 s64 ptokens; /* Current number of P tokens */ 110 s64 t_c; /* Time check-point */ 111 struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ 112 struct qdisc_watchdog watchdog; /* Watchdog timer */ 113 }; 114 115 116 /* Time to Length, convert time in ns to length in bytes 117 * to determinate how many bytes can be sent in given time. 118 */ 119 static u64 psched_ns_t2l(const struct psched_ratecfg *r, 120 u64 time_in_ns) 121 { 122 /* The formula is : 123 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC 124 */ 125 u64 len = time_in_ns * r->rate_bytes_ps; 126 127 do_div(len, NSEC_PER_SEC); 128 129 if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { 130 do_div(len, 53); 131 len = len * 48; 132 } 133 134 if (len > r->overhead) 135 len -= r->overhead; 136 else 137 len = 0; 138 139 return len; 140 } 141 142 static void tbf_offload_change(struct Qdisc *sch, 143 struct netlink_ext_ack *extack) 144 { 145 struct tbf_sched_data *q = qdisc_priv(sch); 146 struct net_device *dev = qdisc_dev(sch); 147 struct tc_tbf_qopt_offload qopt; 148 149 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 150 return; 151 152 qopt.extack = extack; 153 qopt.command = TC_TBF_REPLACE; 154 qopt.handle = sch->handle; 155 qopt.parent = sch->parent; 156 qopt.replace_params.rate = q->rate; 157 qopt.replace_params.max_size = q->max_size; 158 qopt.replace_params.qstats = &sch->qstats; 159 160 dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); 161 } 162 163 static void tbf_offload_destroy(struct Qdisc *sch) 164 { 165 struct net_device *dev = qdisc_dev(sch); 166 struct tc_tbf_qopt_offload qopt; 167 168 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 169 return; 170 171 qopt.extack = NULL; 172 qopt.command = TC_TBF_DESTROY; 173 qopt.handle = sch->handle; 174 qopt.parent = sch->parent; 175 dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); 176 } 177 178 static int tbf_offload_dump(struct Qdisc *sch) 179 { 180 struct tc_tbf_qopt_offload qopt; 181 182 qopt.extack = NULL; 183 qopt.command = TC_TBF_STATS; 184 qopt.handle = sch->handle; 185 qopt.parent = sch->parent; 186 qopt.stats.bstats = &sch->bstats; 187 qopt.stats.qstats = &sch->qstats; 188 189 return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); 190 } 191 192 static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new, 193 struct Qdisc *old, struct netlink_ext_ack *extack) 194 { 195 struct tc_tbf_qopt_offload graft_offload = { 196 .handle = sch->handle, 197 .parent = sch->parent, 198 .child_handle = new->handle, 199 .command = TC_TBF_GRAFT, 200 .extack = extack, 201 }; 202 203 qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, 204 TC_SETUP_QDISC_TBF, &graft_offload, extack); 205 } 206 207 /* GSO packet is too big, segment it so that tbf can transmit 208 * each segment in time 209 */ 210 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, 211 struct sk_buff **to_free) 212 { 213 struct tbf_sched_data *q = qdisc_priv(sch); 214 struct sk_buff *segs, *nskb; 215 netdev_features_t features = netif_skb_features(skb); 216 unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len; 217 int ret, nb; 218 219 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 220 221 if (IS_ERR_OR_NULL(segs)) 222 return qdisc_drop(skb, sch, to_free); 223 224 nb = 0; 225 skb_list_walk_safe(segs, segs, nskb) { 226 skb_mark_not_on_list(segs); 227 seg_len = segs->len; 228 qdisc_skb_cb(segs)->pkt_len = seg_len; 229 qdisc_skb_cb(segs)->pkt_segs = 1; 230 ret = qdisc_enqueue(segs, q->qdisc, to_free); 231 if (ret != NET_XMIT_SUCCESS) { 232 if (net_xmit_drop_count(ret)) 233 qdisc_qstats_drop(sch); 234 } else { 235 nb++; 236 len += seg_len; 237 } 238 } 239 WRITE_ONCE(sch->q.qlen, sch->q.qlen + nb); 240 qstats_backlog_add(sch, len); 241 if (nb > 0) { 242 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); 243 consume_skb(skb); 244 return NET_XMIT_SUCCESS; 245 } 246 247 kfree_skb(skb); 248 return NET_XMIT_DROP; 249 } 250 251 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, 252 struct sk_buff **to_free) 253 { 254 struct tbf_sched_data *q = qdisc_priv(sch); 255 unsigned int len = qdisc_pkt_len(skb); 256 int ret; 257 258 if (qdisc_pkt_len(skb) > q->max_size) { 259 if (skb_is_gso(skb) && 260 skb_gso_validate_mac_len(skb, q->max_size)) 261 return tbf_segment(skb, sch, to_free); 262 return qdisc_drop(skb, sch, to_free); 263 } 264 ret = qdisc_enqueue(skb, q->qdisc, to_free); 265 if (ret != NET_XMIT_SUCCESS) { 266 if (net_xmit_drop_count(ret)) 267 qdisc_qstats_drop(sch); 268 return ret; 269 } 270 271 qstats_backlog_add(sch, len); 272 qdisc_qlen_inc(sch); 273 return NET_XMIT_SUCCESS; 274 } 275 276 static bool tbf_peak_present(const struct tbf_sched_data *q) 277 { 278 return q->peak.rate_bytes_ps; 279 } 280 281 static struct sk_buff *tbf_dequeue(struct Qdisc *sch) 282 { 283 struct tbf_sched_data *q = qdisc_priv(sch); 284 struct sk_buff *skb; 285 286 skb = q->qdisc->ops->peek(q->qdisc); 287 288 if (skb) { 289 s64 now; 290 s64 toks; 291 s64 ptoks = 0; 292 unsigned int len = qdisc_pkt_len(skb); 293 294 now = ktime_get_ns(); 295 toks = min_t(s64, now - q->t_c, q->buffer); 296 297 if (tbf_peak_present(q)) { 298 ptoks = toks + q->ptokens; 299 if (ptoks > q->mtu) 300 ptoks = q->mtu; 301 ptoks -= (s64) psched_l2t_ns(&q->peak, len); 302 } 303 toks += q->tokens; 304 if (toks > q->buffer) 305 toks = q->buffer; 306 toks -= (s64) psched_l2t_ns(&q->rate, len); 307 308 if ((toks|ptoks) >= 0) { 309 skb = qdisc_dequeue_peeked(q->qdisc); 310 if (unlikely(!skb)) 311 return NULL; 312 313 q->t_c = now; 314 q->tokens = toks; 315 q->ptokens = ptoks; 316 qdisc_qstats_backlog_dec(sch, skb); 317 qdisc_qlen_dec(sch); 318 qdisc_bstats_update(sch, skb); 319 return skb; 320 } 321 322 qdisc_watchdog_schedule_ns(&q->watchdog, 323 now + max_t(long, -toks, -ptoks)); 324 325 /* Maybe we have a shorter packet in the queue, 326 which can be sent now. It sounds cool, 327 but, however, this is wrong in principle. 328 We MUST NOT reorder packets under these circumstances. 329 330 Really, if we split the flow into independent 331 subflows, it would be a very good solution. 332 This is the main idea of all FQ algorithms 333 (cf. CSZ, HPFQ, HFSC) 334 */ 335 336 qdisc_qstats_overlimit(sch); 337 } 338 return NULL; 339 } 340 341 static void tbf_reset(struct Qdisc *sch) 342 { 343 struct tbf_sched_data *q = qdisc_priv(sch); 344 345 qdisc_reset(q->qdisc); 346 q->t_c = ktime_get_ns(); 347 q->tokens = q->buffer; 348 q->ptokens = q->mtu; 349 qdisc_watchdog_cancel(&q->watchdog); 350 } 351 352 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { 353 [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, 354 [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, 355 [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, 356 [TCA_TBF_RATE64] = { .type = NLA_U64 }, 357 [TCA_TBF_PRATE64] = { .type = NLA_U64 }, 358 [TCA_TBF_BURST] = { .type = NLA_U32 }, 359 [TCA_TBF_PBURST] = { .type = NLA_U32 }, 360 }; 361 362 static int tbf_change(struct Qdisc *sch, struct nlattr *opt, 363 struct netlink_ext_ack *extack) 364 { 365 int err; 366 struct tbf_sched_data *q = qdisc_priv(sch); 367 struct nlattr *tb[TCA_TBF_MAX + 1]; 368 struct tc_tbf_qopt *qopt; 369 struct Qdisc *child = NULL; 370 struct Qdisc *old = NULL; 371 struct psched_ratecfg rate; 372 struct psched_ratecfg peak; 373 u64 max_size; 374 s64 buffer, mtu; 375 u64 rate64 = 0, prate64 = 0; 376 377 err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy, 378 NULL); 379 if (err < 0) 380 return err; 381 382 err = -EINVAL; 383 if (tb[TCA_TBF_PARMS] == NULL) 384 goto done; 385 386 qopt = nla_data(tb[TCA_TBF_PARMS]); 387 if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) 388 qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, 389 tb[TCA_TBF_RTAB], 390 NULL)); 391 392 if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) 393 qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, 394 tb[TCA_TBF_PTAB], 395 NULL)); 396 397 buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); 398 mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); 399 400 if (tb[TCA_TBF_RATE64]) 401 rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); 402 psched_ratecfg_precompute(&rate, &qopt->rate, rate64); 403 404 if (tb[TCA_TBF_BURST]) { 405 max_size = nla_get_u32(tb[TCA_TBF_BURST]); 406 buffer = psched_l2t_ns(&rate, max_size); 407 } else { 408 max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); 409 } 410 411 if (qopt->peakrate.rate) { 412 if (tb[TCA_TBF_PRATE64]) 413 prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); 414 psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); 415 if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { 416 pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", 417 peak.rate_bytes_ps, rate.rate_bytes_ps); 418 err = -EINVAL; 419 goto done; 420 } 421 422 if (tb[TCA_TBF_PBURST]) { 423 u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); 424 max_size = min_t(u32, max_size, pburst); 425 mtu = psched_l2t_ns(&peak, pburst); 426 } else { 427 max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); 428 } 429 } else { 430 memset(&peak, 0, sizeof(peak)); 431 } 432 433 if (max_size < psched_mtu(qdisc_dev(sch))) 434 pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", 435 max_size, qdisc_dev(sch)->name, 436 psched_mtu(qdisc_dev(sch))); 437 438 if (!max_size) { 439 err = -EINVAL; 440 goto done; 441 } 442 443 if (q->qdisc != &noop_qdisc) { 444 err = fifo_set_limit(q->qdisc, qopt->limit); 445 if (err) 446 goto done; 447 } else if (qopt->limit > 0) { 448 child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit, 449 extack); 450 if (IS_ERR(child)) { 451 err = PTR_ERR(child); 452 goto done; 453 } 454 455 /* child is fifo, no need to check for noop_qdisc */ 456 qdisc_hash_add(child, true); 457 } 458 459 sch_tree_lock(sch); 460 if (child) { 461 qdisc_purge_queue(q->qdisc); 462 old = q->qdisc; 463 q->qdisc = child; 464 } 465 q->limit = qopt->limit; 466 if (tb[TCA_TBF_PBURST]) 467 q->mtu = mtu; 468 else 469 q->mtu = PSCHED_TICKS2NS(qopt->mtu); 470 q->max_size = max_size; 471 if (tb[TCA_TBF_BURST]) 472 q->buffer = buffer; 473 else 474 q->buffer = PSCHED_TICKS2NS(qopt->buffer); 475 q->tokens = q->buffer; 476 q->ptokens = q->mtu; 477 478 memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); 479 memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); 480 481 sch_tree_unlock(sch); 482 qdisc_put(old); 483 err = 0; 484 485 tbf_offload_change(sch, extack); 486 done: 487 return err; 488 } 489 490 static int tbf_init(struct Qdisc *sch, struct nlattr *opt, 491 struct netlink_ext_ack *extack) 492 { 493 struct tbf_sched_data *q = qdisc_priv(sch); 494 495 qdisc_watchdog_init(&q->watchdog, sch); 496 q->qdisc = &noop_qdisc; 497 498 if (!opt) 499 return -EINVAL; 500 501 q->t_c = ktime_get_ns(); 502 503 return tbf_change(sch, opt, extack); 504 } 505 506 static void tbf_destroy(struct Qdisc *sch) 507 { 508 struct tbf_sched_data *q = qdisc_priv(sch); 509 510 qdisc_watchdog_cancel(&q->watchdog); 511 tbf_offload_destroy(sch); 512 qdisc_put(q->qdisc); 513 } 514 515 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) 516 { 517 struct tbf_sched_data *q = qdisc_priv(sch); 518 struct nlattr *nest; 519 struct tc_tbf_qopt opt; 520 int err; 521 522 err = tbf_offload_dump(sch); 523 if (err) 524 return err; 525 526 nest = nla_nest_start_noflag(skb, TCA_OPTIONS); 527 if (nest == NULL) 528 goto nla_put_failure; 529 530 opt.limit = q->limit; 531 psched_ratecfg_getrate(&opt.rate, &q->rate); 532 if (tbf_peak_present(q)) 533 psched_ratecfg_getrate(&opt.peakrate, &q->peak); 534 else 535 memset(&opt.peakrate, 0, sizeof(opt.peakrate)); 536 opt.mtu = PSCHED_NS2TICKS(q->mtu); 537 opt.buffer = PSCHED_NS2TICKS(q->buffer); 538 if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) 539 goto nla_put_failure; 540 if (q->rate.rate_bytes_ps >= (1ULL << 32) && 541 nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps, 542 TCA_TBF_PAD)) 543 goto nla_put_failure; 544 if (tbf_peak_present(q) && 545 q->peak.rate_bytes_ps >= (1ULL << 32) && 546 nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps, 547 TCA_TBF_PAD)) 548 goto nla_put_failure; 549 550 return nla_nest_end(skb, nest); 551 552 nla_put_failure: 553 nla_nest_cancel(skb, nest); 554 return -1; 555 } 556 557 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, 558 struct sk_buff *skb, struct tcmsg *tcm) 559 { 560 struct tbf_sched_data *q = qdisc_priv(sch); 561 562 tcm->tcm_handle |= TC_H_MIN(1); 563 tcm->tcm_info = q->qdisc->handle; 564 565 return 0; 566 } 567 568 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 569 struct Qdisc **old, struct netlink_ext_ack *extack) 570 { 571 struct tbf_sched_data *q = qdisc_priv(sch); 572 573 if (new == NULL) 574 new = &noop_qdisc; 575 576 *old = qdisc_replace(sch, new, &q->qdisc); 577 578 tbf_offload_graft(sch, new, *old, extack); 579 return 0; 580 } 581 582 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) 583 { 584 struct tbf_sched_data *q = qdisc_priv(sch); 585 return q->qdisc; 586 } 587 588 static unsigned long tbf_find(struct Qdisc *sch, u32 classid) 589 { 590 return 1; 591 } 592 593 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) 594 { 595 if (!walker->stop) { 596 tc_qdisc_stats_dump(sch, 1, walker); 597 } 598 } 599 600 static const struct Qdisc_class_ops tbf_class_ops = { 601 .graft = tbf_graft, 602 .leaf = tbf_leaf, 603 .find = tbf_find, 604 .walk = tbf_walk, 605 .dump = tbf_dump_class, 606 }; 607 608 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { 609 .next = NULL, 610 .cl_ops = &tbf_class_ops, 611 .id = "tbf", 612 .priv_size = sizeof(struct tbf_sched_data), 613 .enqueue = tbf_enqueue, 614 .dequeue = tbf_dequeue, 615 .peek = qdisc_peek_dequeued, 616 .init = tbf_init, 617 .reset = tbf_reset, 618 .destroy = tbf_destroy, 619 .change = tbf_change, 620 .dump = tbf_dump, 621 .owner = THIS_MODULE, 622 }; 623 MODULE_ALIAS_NET_SCH("tbf"); 624 625 static int __init tbf_module_init(void) 626 { 627 return register_qdisc(&tbf_qdisc_ops); 628 } 629 630 static void __exit tbf_module_exit(void) 631 { 632 unregister_qdisc(&tbf_qdisc_ops); 633 } 634 module_init(tbf_module_init) 635 module_exit(tbf_module_exit) 636 MODULE_LICENSE("GPL"); 637 MODULE_DESCRIPTION("Token Bucket Filter qdisc"); 638