1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * net/sched/sch_api.c Packet scheduler API. 4 * 5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 6 * 7 * Fixes: 8 * 9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. 10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support 11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support 12 */ 13 14 #include <linux/module.h> 15 #include <linux/types.h> 16 #include <linux/kernel.h> 17 #include <linux/string.h> 18 #include <linux/errno.h> 19 #include <linux/skbuff.h> 20 #include <linux/init.h> 21 #include <linux/proc_fs.h> 22 #include <linux/seq_file.h> 23 #include <linux/kmod.h> 24 #include <linux/list.h> 25 #include <linux/hrtimer.h> 26 #include <linux/slab.h> 27 #include <linux/hashtable.h> 28 #include <linux/bpf.h> 29 30 #include <net/netdev_lock.h> 31 #include <net/net_namespace.h> 32 #include <net/sock.h> 33 #include <net/netlink.h> 34 #include <net/pkt_sched.h> 35 #include <net/pkt_cls.h> 36 #include <net/tc_wrapper.h> 37 38 #include <trace/events/qdisc.h> 39 40 /* 41 42 Short review. 43 ------------- 44 45 This file consists of two interrelated parts: 46 47 1. queueing disciplines manager frontend. 48 2. traffic classes manager frontend. 49 50 Generally, queueing discipline ("qdisc") is a black box, 51 which is able to enqueue packets and to dequeue them (when 52 device is ready to send something) in order and at times 53 determined by algorithm hidden in it. 54 55 qdisc's are divided to two categories: 56 - "queues", which have no internal structure visible from outside. 57 - "schedulers", which split all the packets to "traffic classes", 58 using "packet classifiers" (look at cls_api.c) 59 60 In turn, classes may have child qdiscs (as rule, queues) 61 attached to them etc. etc. etc. 62 63 The goal of the routines in this file is to translate 64 information supplied by user in the form of handles 65 to more intelligible for kernel form, to make some sanity 66 checks and part of work, which is common to all qdiscs 67 and to provide rtnetlink notifications. 68 69 All real intelligent work is done inside qdisc modules. 70 71 72 73 Every discipline has two major routines: enqueue and dequeue. 74 75 ---dequeue 76 77 dequeue usually returns a skb to send. It is allowed to return NULL, 78 but it does not mean that queue is empty, it just means that 79 discipline does not want to send anything this time. 80 Queue is really empty if q->q.qlen == 0. 81 For complicated disciplines with multiple queues q->q is not 82 real packet queue, but however q->q.qlen must be valid. 83 84 ---enqueue 85 86 enqueue returns 0, if packet was enqueued successfully. 87 If packet (this one or another one) was dropped, it returns 88 not zero error code. 89 NET_XMIT_DROP - this packet dropped 90 Expected action: do not backoff, but wait until queue will clear. 91 NET_XMIT_CN - probably this packet enqueued, but another one dropped. 92 Expected action: backoff or ignore 93 94 Auxiliary routines: 95 96 ---peek 97 98 like dequeue but without removing a packet from the queue 99 100 ---reset 101 102 returns qdisc to initial state: purge all buffers, clear all 103 timers, counters (except for statistics) etc. 104 105 ---init 106 107 initializes newly created qdisc. 108 109 ---destroy 110 111 destroys resources allocated by init and during lifetime of qdisc. 112 113 ---change 114 115 changes qdisc parameters. 116 */ 117 118 /* Protects list of registered TC modules. It is pure SMP lock. */ 119 static DEFINE_RWLOCK(qdisc_mod_lock); 120 121 122 /************************************************ 123 * Queueing disciplines manipulation. * 124 ************************************************/ 125 126 127 /* The list of all installed queueing disciplines. */ 128 129 static struct Qdisc_ops *qdisc_base; 130 131 /* Register/unregister queueing discipline */ 132 133 int register_qdisc(struct Qdisc_ops *qops) 134 { 135 struct Qdisc_ops *q, **qp; 136 int rc = -EEXIST; 137 138 write_lock(&qdisc_mod_lock); 139 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 140 if (!strcmp(qops->id, q->id)) 141 goto out; 142 143 if (qops->enqueue == NULL) 144 qops->enqueue = noop_qdisc_ops.enqueue; 145 if (qops->peek == NULL) { 146 if (qops->dequeue == NULL) 147 qops->peek = noop_qdisc_ops.peek; 148 else 149 goto out_einval; 150 } 151 if (qops->dequeue == NULL) 152 qops->dequeue = noop_qdisc_ops.dequeue; 153 154 if (qops->cl_ops) { 155 const struct Qdisc_class_ops *cops = qops->cl_ops; 156 157 if (!(cops->find && cops->walk && cops->leaf)) 158 goto out_einval; 159 160 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) 161 goto out_einval; 162 } 163 164 qops->next = NULL; 165 *qp = qops; 166 rc = 0; 167 out: 168 write_unlock(&qdisc_mod_lock); 169 return rc; 170 171 out_einval: 172 rc = -EINVAL; 173 goto out; 174 } 175 EXPORT_SYMBOL(register_qdisc); 176 177 void unregister_qdisc(struct Qdisc_ops *qops) 178 { 179 struct Qdisc_ops *q, **qp; 180 int err = -ENOENT; 181 182 write_lock(&qdisc_mod_lock); 183 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 184 if (q == qops) 185 break; 186 if (q) { 187 *qp = q->next; 188 q->next = NULL; 189 err = 0; 190 } 191 write_unlock(&qdisc_mod_lock); 192 193 WARN(err, "unregister qdisc(%s) failed\n", qops->id); 194 } 195 EXPORT_SYMBOL(unregister_qdisc); 196 197 /* Get default qdisc if not otherwise specified */ 198 void qdisc_get_default(char *name, size_t len) 199 { 200 read_lock(&qdisc_mod_lock); 201 strscpy(name, default_qdisc_ops->id, len); 202 read_unlock(&qdisc_mod_lock); 203 } 204 205 static struct Qdisc_ops *qdisc_lookup_default(const char *name) 206 { 207 struct Qdisc_ops *q = NULL; 208 209 for (q = qdisc_base; q; q = q->next) { 210 if (!strcmp(name, q->id)) { 211 if (!bpf_try_module_get(q, q->owner)) 212 q = NULL; 213 break; 214 } 215 } 216 217 return q; 218 } 219 220 /* Set new default qdisc to use */ 221 int qdisc_set_default(const char *name) 222 { 223 const struct Qdisc_ops *ops; 224 225 if (!capable(CAP_NET_ADMIN)) 226 return -EPERM; 227 228 write_lock(&qdisc_mod_lock); 229 ops = qdisc_lookup_default(name); 230 if (!ops) { 231 /* Not found, drop lock and try to load module */ 232 write_unlock(&qdisc_mod_lock); 233 request_module(NET_SCH_ALIAS_PREFIX "%s", name); 234 write_lock(&qdisc_mod_lock); 235 236 ops = qdisc_lookup_default(name); 237 } 238 239 if (ops) { 240 /* Set new default */ 241 bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); 242 default_qdisc_ops = ops; 243 } 244 write_unlock(&qdisc_mod_lock); 245 246 return ops ? 0 : -ENOENT; 247 } 248 249 #ifdef CONFIG_NET_SCH_DEFAULT 250 /* Set default value from kernel config */ 251 static int __init sch_default_qdisc(void) 252 { 253 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH); 254 } 255 late_initcall(sch_default_qdisc); 256 #endif 257 258 /* We know handle. Find qdisc among all qdisc's attached to device 259 * (root qdisc, all its children, children of children etc.) 260 * Note: caller either uses rtnl or rcu_read_lock() 261 */ 262 263 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) 264 { 265 struct Qdisc *q; 266 267 if (!qdisc_dev(root)) 268 return (root->handle == handle ? root : NULL); 269 270 if (!(root->flags & TCQ_F_BUILTIN) && 271 root->handle == handle) 272 return root; 273 274 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, 275 lockdep_rtnl_is_held()) { 276 if (q->handle == handle) 277 return q; 278 } 279 return NULL; 280 } 281 282 void qdisc_hash_add(struct Qdisc *q, bool invisible) 283 { 284 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 285 ASSERT_RTNL(); 286 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); 287 if (invisible) 288 q->flags |= TCQ_F_INVISIBLE; 289 } 290 } 291 EXPORT_SYMBOL(qdisc_hash_add); 292 293 void qdisc_hash_del(struct Qdisc *q) 294 { 295 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 296 ASSERT_RTNL(); 297 hash_del_rcu(&q->hash); 298 } 299 } 300 EXPORT_SYMBOL(qdisc_hash_del); 301 302 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) 303 { 304 struct Qdisc *q; 305 306 if (!handle) 307 return NULL; 308 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); 309 if (q) 310 goto out; 311 312 if (dev_ingress_queue(dev)) 313 q = qdisc_match_from_root( 314 rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping), 315 handle); 316 out: 317 return q; 318 } 319 320 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) 321 { 322 struct netdev_queue *nq; 323 struct Qdisc *q; 324 325 if (!handle) 326 return NULL; 327 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); 328 if (q) 329 goto out; 330 331 nq = dev_ingress_queue_rcu(dev); 332 if (nq) 333 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping), 334 handle); 335 out: 336 return q; 337 } 338 339 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, 340 struct netlink_ext_ack *extack) 341 { 342 unsigned long cl; 343 const struct Qdisc_class_ops *cops = p->ops->cl_ops; 344 345 if (cops == NULL) { 346 NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); 347 return ERR_PTR(-EOPNOTSUPP); 348 } 349 cl = cops->find(p, classid); 350 351 if (cl == 0) { 352 NL_SET_ERR_MSG(extack, "Specified class not found"); 353 return ERR_PTR(-ENOENT); 354 } 355 return cops->leaf(p, cl); 356 } 357 358 /* Find queueing discipline by name */ 359 360 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) 361 { 362 struct Qdisc_ops *q = NULL; 363 364 if (kind) { 365 read_lock(&qdisc_mod_lock); 366 for (q = qdisc_base; q; q = q->next) { 367 if (nla_strcmp(kind, q->id) == 0) { 368 if (!bpf_try_module_get(q, q->owner)) 369 q = NULL; 370 break; 371 } 372 } 373 read_unlock(&qdisc_mod_lock); 374 } 375 return q; 376 } 377 378 /* The linklayer setting were not transferred from iproute2, in older 379 * versions, and the rate tables lookup systems have been dropped in 380 * the kernel. To keep backward compatible with older iproute2 tc 381 * utils, we detect the linklayer setting by detecting if the rate 382 * table were modified. 383 * 384 * For linklayer ATM table entries, the rate table will be aligned to 385 * 48 bytes, thus some table entries will contain the same value. The 386 * mpu (min packet unit) is also encoded into the old rate table, thus 387 * starting from the mpu, we find low and high table entries for 388 * mapping this cell. If these entries contain the same value, when 389 * the rate tables have been modified for linklayer ATM. 390 * 391 * This is done by rounding mpu to the nearest 48 bytes cell/entry, 392 * and then roundup to the next cell, calc the table entry one below, 393 * and compare. 394 */ 395 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) 396 { 397 int low = roundup(r->mpu, 48); 398 int high = roundup(low+1, 48); 399 int cell_low = low >> r->cell_log; 400 int cell_high = (high >> r->cell_log) - 1; 401 402 /* rtab is too inaccurate at rates > 100Mbit/s */ 403 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { 404 pr_debug("TC linklayer: Giving up ATM detection\n"); 405 return TC_LINKLAYER_ETHERNET; 406 } 407 408 if ((cell_high > cell_low) && (cell_high < 256) 409 && (rtab[cell_low] == rtab[cell_high])) { 410 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", 411 cell_low, cell_high, rtab[cell_high]); 412 return TC_LINKLAYER_ATM; 413 } 414 return TC_LINKLAYER_ETHERNET; 415 } 416 417 static struct qdisc_rate_table *qdisc_rtab_list; 418 419 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, 420 struct nlattr *tab, 421 struct netlink_ext_ack *extack) 422 { 423 struct qdisc_rate_table *rtab; 424 425 if (tab == NULL || r->rate == 0 || 426 r->cell_log == 0 || r->cell_log >= 32 || 427 nla_len(tab) != TC_RTAB_SIZE) { 428 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); 429 return NULL; 430 } 431 432 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { 433 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && 434 !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) { 435 rtab->refcnt++; 436 return rtab; 437 } 438 } 439 440 rtab = kmalloc_obj(*rtab); 441 if (rtab) { 442 rtab->rate = *r; 443 rtab->refcnt = 1; 444 memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE); 445 if (r->linklayer == TC_LINKLAYER_UNAWARE) 446 r->linklayer = __detect_linklayer(r, rtab->data); 447 rtab->next = qdisc_rtab_list; 448 qdisc_rtab_list = rtab; 449 } else { 450 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); 451 } 452 return rtab; 453 } 454 EXPORT_SYMBOL(qdisc_get_rtab); 455 456 void qdisc_put_rtab(struct qdisc_rate_table *tab) 457 { 458 struct qdisc_rate_table *rtab, **rtabp; 459 460 if (!tab || --tab->refcnt) 461 return; 462 463 for (rtabp = &qdisc_rtab_list; 464 (rtab = *rtabp) != NULL; 465 rtabp = &rtab->next) { 466 if (rtab == tab) { 467 *rtabp = rtab->next; 468 kfree(rtab); 469 return; 470 } 471 } 472 } 473 EXPORT_SYMBOL(qdisc_put_rtab); 474 475 static LIST_HEAD(qdisc_stab_list); 476 477 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { 478 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, 479 [TCA_STAB_DATA] = { .type = NLA_BINARY }, 480 }; 481 482 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, 483 struct netlink_ext_ack *extack) 484 { 485 struct nlattr *tb[TCA_STAB_MAX + 1]; 486 struct qdisc_size_table *stab; 487 struct tc_sizespec *s; 488 unsigned int tsize = 0; 489 u16 *tab = NULL; 490 int err; 491 492 err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy, 493 extack); 494 if (err < 0) 495 return ERR_PTR(err); 496 if (!tb[TCA_STAB_BASE]) { 497 NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); 498 return ERR_PTR(-EINVAL); 499 } 500 501 s = nla_data(tb[TCA_STAB_BASE]); 502 503 if (s->tsize > 0) { 504 if (!tb[TCA_STAB_DATA]) { 505 NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); 506 return ERR_PTR(-EINVAL); 507 } 508 tab = nla_data(tb[TCA_STAB_DATA]); 509 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); 510 } 511 512 if (tsize != s->tsize || (!tab && tsize > 0)) { 513 NL_SET_ERR_MSG(extack, "Invalid size of size table"); 514 return ERR_PTR(-EINVAL); 515 } 516 517 list_for_each_entry(stab, &qdisc_stab_list, list) { 518 if (memcmp(&stab->szopts, s, sizeof(*s))) 519 continue; 520 if (tsize > 0 && 521 memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) 522 continue; 523 stab->refcnt++; 524 return stab; 525 } 526 527 if (s->size_log > STAB_SIZE_LOG_MAX || 528 s->cell_log > STAB_SIZE_LOG_MAX) { 529 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); 530 return ERR_PTR(-EINVAL); 531 } 532 533 stab = kmalloc_flex(*stab, data, tsize); 534 if (!stab) 535 return ERR_PTR(-ENOMEM); 536 537 stab->refcnt = 1; 538 stab->szopts = *s; 539 if (tsize > 0) 540 memcpy(stab->data, tab, flex_array_size(stab, data, tsize)); 541 542 list_add_tail(&stab->list, &qdisc_stab_list); 543 544 return stab; 545 } 546 547 void qdisc_put_stab(struct qdisc_size_table *tab) 548 { 549 if (!tab) 550 return; 551 552 if (--tab->refcnt == 0) { 553 list_del(&tab->list); 554 kfree_rcu(tab, rcu); 555 } 556 } 557 EXPORT_SYMBOL(qdisc_put_stab); 558 559 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) 560 { 561 struct nlattr *nest; 562 563 nest = nla_nest_start_noflag(skb, TCA_STAB); 564 if (nest == NULL) 565 goto nla_put_failure; 566 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) 567 goto nla_put_failure; 568 nla_nest_end(skb, nest); 569 570 return skb->len; 571 572 nla_put_failure: 573 return -1; 574 } 575 576 void __qdisc_calculate_pkt_len(struct sk_buff *skb, 577 const struct qdisc_size_table *stab) 578 { 579 int pkt_len, slot; 580 581 pkt_len = skb->len + stab->szopts.overhead; 582 if (unlikely(!stab->szopts.tsize)) 583 goto out; 584 585 slot = pkt_len + stab->szopts.cell_align; 586 if (unlikely(slot < 0)) 587 slot = 0; 588 589 slot >>= stab->szopts.cell_log; 590 if (likely(slot < stab->szopts.tsize)) 591 pkt_len = stab->data[slot]; 592 else 593 pkt_len = stab->data[stab->szopts.tsize - 1] * 594 (slot / stab->szopts.tsize) + 595 stab->data[slot % stab->szopts.tsize]; 596 597 pkt_len <<= stab->szopts.size_log; 598 out: 599 if (unlikely(pkt_len < 1)) 600 pkt_len = 1; 601 qdisc_skb_cb(skb)->pkt_len = pkt_len; 602 } 603 604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) 605 { 606 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, 607 timer); 608 609 rcu_read_lock(); 610 __netif_schedule(qdisc_root(wd->qdisc)); 611 rcu_read_unlock(); 612 613 return HRTIMER_NORESTART; 614 } 615 616 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, 617 clockid_t clockid) 618 { 619 hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); 620 wd->qdisc = qdisc; 621 } 622 EXPORT_SYMBOL(qdisc_watchdog_init_clockid); 623 624 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) 625 { 626 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); 627 } 628 EXPORT_SYMBOL(qdisc_watchdog_init); 629 630 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, 631 u64 delta_ns) 632 { 633 bool deactivated; 634 635 rcu_read_lock(); 636 deactivated = test_bit(__QDISC_STATE_DEACTIVATED, 637 &qdisc_root_sleeping(wd->qdisc)->state); 638 rcu_read_unlock(); 639 if (deactivated) 640 return; 641 642 if (hrtimer_is_queued(&wd->timer)) { 643 u64 softexpires; 644 645 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer)); 646 /* If timer is already set in [expires, expires + delta_ns], 647 * do not reprogram it. 648 */ 649 if (softexpires - expires <= delta_ns) 650 return; 651 } 652 653 hrtimer_start_range_ns(&wd->timer, 654 ns_to_ktime(expires), 655 delta_ns, 656 HRTIMER_MODE_ABS_PINNED); 657 } 658 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); 659 660 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) 661 { 662 hrtimer_cancel(&wd->timer); 663 } 664 EXPORT_SYMBOL(qdisc_watchdog_cancel); 665 666 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) 667 { 668 struct hlist_head *h; 669 unsigned int i; 670 671 h = kvmalloc_objs(struct hlist_head, n); 672 673 if (h != NULL) { 674 for (i = 0; i < n; i++) 675 INIT_HLIST_HEAD(&h[i]); 676 } 677 return h; 678 } 679 680 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) 681 { 682 struct Qdisc_class_common *cl; 683 struct hlist_node *next; 684 struct hlist_head *nhash, *ohash; 685 unsigned int nsize, nmask, osize; 686 unsigned int i, h; 687 688 /* Rehash when load factor exceeds 0.75 */ 689 if (clhash->hashelems * 4 <= clhash->hashsize * 3) 690 return; 691 nsize = clhash->hashsize * 2; 692 nmask = nsize - 1; 693 nhash = qdisc_class_hash_alloc(nsize); 694 if (nhash == NULL) 695 return; 696 697 ohash = clhash->hash; 698 osize = clhash->hashsize; 699 700 sch_tree_lock(sch); 701 for (i = 0; i < osize; i++) { 702 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { 703 h = qdisc_class_hash(cl->classid, nmask); 704 hlist_add_head(&cl->hnode, &nhash[h]); 705 } 706 } 707 clhash->hash = nhash; 708 clhash->hashsize = nsize; 709 clhash->hashmask = nmask; 710 sch_tree_unlock(sch); 711 712 kvfree(ohash); 713 } 714 EXPORT_SYMBOL(qdisc_class_hash_grow); 715 716 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) 717 { 718 unsigned int size = 4; 719 720 clhash->hash = qdisc_class_hash_alloc(size); 721 if (!clhash->hash) 722 return -ENOMEM; 723 clhash->hashsize = size; 724 clhash->hashmask = size - 1; 725 clhash->hashelems = 0; 726 return 0; 727 } 728 EXPORT_SYMBOL(qdisc_class_hash_init); 729 730 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) 731 { 732 kvfree(clhash->hash); 733 } 734 EXPORT_SYMBOL(qdisc_class_hash_destroy); 735 736 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, 737 struct Qdisc_class_common *cl) 738 { 739 unsigned int h; 740 741 INIT_HLIST_NODE(&cl->hnode); 742 h = qdisc_class_hash(cl->classid, clhash->hashmask); 743 hlist_add_head(&cl->hnode, &clhash->hash[h]); 744 clhash->hashelems++; 745 } 746 EXPORT_SYMBOL(qdisc_class_hash_insert); 747 748 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, 749 struct Qdisc_class_common *cl) 750 { 751 hlist_del(&cl->hnode); 752 clhash->hashelems--; 753 } 754 EXPORT_SYMBOL(qdisc_class_hash_remove); 755 756 /* Allocate an unique handle from space managed by kernel 757 * Possible range is [8000-FFFF]:0000 (0x8000 values) 758 */ 759 static u32 qdisc_alloc_handle(struct net_device *dev) 760 { 761 int i = 0x8000; 762 static u32 autohandle = TC_H_MAKE(0x80000000U, 0); 763 764 do { 765 autohandle += TC_H_MAKE(0x10000U, 0); 766 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) 767 autohandle = TC_H_MAKE(0x80000000U, 0); 768 if (!qdisc_lookup(dev, autohandle)) 769 return autohandle; 770 cond_resched(); 771 } while (--i > 0); 772 773 return 0; 774 } 775 776 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) 777 { 778 const struct Qdisc_class_ops *cops; 779 unsigned long cl; 780 u32 parentid; 781 bool notify; 782 int drops; 783 784 drops = max_t(int, n, 0); 785 rcu_read_lock(); 786 while ((parentid = sch->parent)) { 787 if (parentid == TC_H_ROOT) 788 break; 789 790 if (sch->flags & TCQ_F_NOPARENT) 791 break; 792 /* Notify parent qdisc only if child qdisc becomes empty. */ 793 notify = !sch->q.qlen; 794 /* TODO: perform the search on a per txq basis */ 795 sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); 796 if (sch == NULL) { 797 WARN_ON_ONCE(parentid != TC_H_ROOT); 798 break; 799 } 800 cops = sch->ops->cl_ops; 801 if (notify && cops->qlen_notify) { 802 /* Note that qlen_notify must be idempotent as it may get called 803 * multiple times. 804 */ 805 cl = cops->find(sch, parentid); 806 cops->qlen_notify(sch, cl); 807 } 808 WRITE_ONCE(sch->q.qlen, sch->q.qlen - n); 809 qstats_backlog_sub(sch, len); 810 __qdisc_qstats_drop(sch, drops); 811 } 812 rcu_read_unlock(); 813 } 814 EXPORT_SYMBOL(qdisc_tree_reduce_backlog); 815 816 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, 817 void *type_data) 818 { 819 struct net_device *dev = qdisc_dev(sch); 820 int err; 821 822 sch->flags &= ~TCQ_F_OFFLOADED; 823 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 824 return 0; 825 826 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); 827 if (err == -EOPNOTSUPP) 828 return 0; 829 830 if (!err) 831 sch->flags |= TCQ_F_OFFLOADED; 832 833 return err; 834 } 835 EXPORT_SYMBOL(qdisc_offload_dump_helper); 836 837 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, 838 struct Qdisc *new, struct Qdisc *old, 839 enum tc_setup_type type, void *type_data, 840 struct netlink_ext_ack *extack) 841 { 842 bool any_qdisc_is_offloaded; 843 int err; 844 845 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 846 return; 847 848 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); 849 850 /* Don't report error if the graft is part of destroy operation. */ 851 if (!err || !new || new == &noop_qdisc) 852 return; 853 854 /* Don't report error if the parent, the old child and the new 855 * one are not offloaded. 856 */ 857 any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; 858 any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; 859 any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; 860 861 if (any_qdisc_is_offloaded) 862 NL_SET_ERR_MSG_WEAK(extack, "Offloading graft operation failed."); 863 } 864 EXPORT_SYMBOL(qdisc_offload_graft_helper); 865 866 void qdisc_offload_query_caps(struct net_device *dev, 867 enum tc_setup_type type, 868 void *caps, size_t caps_len) 869 { 870 const struct net_device_ops *ops = dev->netdev_ops; 871 struct tc_query_caps_base base = { 872 .type = type, 873 .caps = caps, 874 }; 875 876 memset(caps, 0, caps_len); 877 878 if (ops->ndo_setup_tc) 879 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base); 880 } 881 EXPORT_SYMBOL(qdisc_offload_query_caps); 882 883 static void qdisc_offload_graft_root(struct net_device *dev, 884 struct Qdisc *new, struct Qdisc *old, 885 struct netlink_ext_ack *extack) 886 { 887 struct tc_root_qopt_offload graft_offload = { 888 .command = TC_ROOT_GRAFT, 889 .handle = new ? new->handle : 0, 890 .ingress = (new && new->flags & TCQ_F_INGRESS) || 891 (old && old->flags & TCQ_F_INGRESS), 892 }; 893 894 qdisc_offload_graft_helper(dev, NULL, new, old, 895 TC_SETUP_ROOT_QDISC, &graft_offload, extack); 896 } 897 898 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 899 u32 portid, u32 seq, u16 flags, int event, 900 struct netlink_ext_ack *extack) 901 { 902 struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; 903 struct gnet_stats_queue __percpu *cpu_qstats = NULL; 904 struct tcmsg *tcm; 905 struct nlmsghdr *nlh; 906 unsigned char *b = skb_tail_pointer(skb); 907 struct gnet_dump d; 908 struct qdisc_size_table *stab; 909 u32 block_index; 910 __u32 qlen; 911 912 cond_resched(); 913 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); 914 if (!nlh) 915 goto out_nlmsg_trim; 916 tcm = nlmsg_data(nlh); 917 tcm->tcm_family = AF_UNSPEC; 918 tcm->tcm__pad1 = 0; 919 tcm->tcm__pad2 = 0; 920 tcm->tcm_ifindex = qdisc_dev(q)->ifindex; 921 tcm->tcm_parent = clid; 922 tcm->tcm_handle = q->handle; 923 tcm->tcm_info = refcount_read(&q->refcnt); 924 if (nla_put_string(skb, TCA_KIND, q->ops->id)) 925 goto nla_put_failure; 926 if (q->ops->ingress_block_get) { 927 block_index = q->ops->ingress_block_get(q); 928 if (block_index && 929 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) 930 goto nla_put_failure; 931 } 932 if (q->ops->egress_block_get) { 933 block_index = q->ops->egress_block_get(q); 934 if (block_index && 935 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) 936 goto nla_put_failure; 937 } 938 if (q->ops->dump && q->ops->dump(q, skb) < 0) 939 goto nla_put_failure; 940 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) 941 goto nla_put_failure; 942 qlen = qdisc_qlen_sum(q); 943 944 stab = rtnl_dereference(q->stab); 945 if (stab && qdisc_dump_stab(skb, stab) < 0) 946 goto nla_put_failure; 947 948 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 949 NULL, &d, TCA_PAD) < 0) 950 goto nla_put_failure; 951 952 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) 953 goto nla_put_failure; 954 955 if (qdisc_is_percpu_stats(q)) { 956 cpu_bstats = q->cpu_bstats; 957 cpu_qstats = q->cpu_qstats; 958 } 959 960 if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || 961 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || 962 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) 963 goto nla_put_failure; 964 965 if (gnet_stats_finish_copy(&d) < 0) 966 goto nla_put_failure; 967 968 if (extack && extack->_msg && 969 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) 970 goto out_nlmsg_trim; 971 972 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 973 974 return skb->len; 975 976 out_nlmsg_trim: 977 nla_put_failure: 978 nlmsg_trim(skb, b); 979 return -EMSGSIZE; 980 } 981 982 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible, 983 const struct tcmsg *tcm) 984 { 985 if (q->flags & TCQ_F_BUILTIN) 986 return true; 987 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) 988 return true; 989 if (tcm) { 990 if (tcm->tcm_handle && tcm->tcm_handle != q->handle) 991 return true; 992 } 993 return false; 994 } 995 996 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, 997 struct nlmsghdr *n, u32 clid, struct Qdisc *q, 998 struct netlink_ext_ack *extack) 999 { 1000 struct sk_buff *skb; 1001 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 1002 1003 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1004 if (!skb) 1005 return -ENOBUFS; 1006 1007 if (!tc_qdisc_dump_ignore(q, false, NULL)) { 1008 if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, 1009 RTM_NEWQDISC, extack) < 0) 1010 goto err_out; 1011 } 1012 1013 if (skb->len) 1014 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 1015 n->nlmsg_flags & NLM_F_ECHO); 1016 1017 err_out: 1018 kfree_skb(skb); 1019 return -EINVAL; 1020 } 1021 1022 static int qdisc_notify(struct net *net, struct sk_buff *oskb, 1023 struct nlmsghdr *n, u32 clid, 1024 struct Qdisc *old, struct Qdisc *new, 1025 struct netlink_ext_ack *extack) 1026 { 1027 struct sk_buff *skb; 1028 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 1029 1030 if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) 1031 return 0; 1032 1033 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1034 if (!skb) 1035 return -ENOBUFS; 1036 1037 if (old && !tc_qdisc_dump_ignore(old, false, NULL)) { 1038 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 1039 0, RTM_DELQDISC, extack) < 0) 1040 goto err_out; 1041 } 1042 if (new && !tc_qdisc_dump_ignore(new, false, NULL)) { 1043 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, 1044 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) 1045 goto err_out; 1046 } 1047 1048 if (skb->len) 1049 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 1050 n->nlmsg_flags & NLM_F_ECHO); 1051 1052 err_out: 1053 kfree_skb(skb); 1054 return -EINVAL; 1055 } 1056 1057 static void notify_and_destroy(struct net *net, struct sk_buff *skb, 1058 struct nlmsghdr *n, u32 clid, 1059 struct Qdisc *old, struct Qdisc *new, 1060 struct netlink_ext_ack *extack) 1061 { 1062 if (new || old) 1063 qdisc_notify(net, skb, n, clid, old, new, extack); 1064 1065 if (old) 1066 qdisc_put(old); 1067 } 1068 1069 static void qdisc_clear_nolock(struct Qdisc *sch) 1070 { 1071 sch->flags &= ~TCQ_F_NOLOCK; 1072 if (!(sch->flags & TCQ_F_CPUSTATS)) 1073 return; 1074 1075 free_percpu(sch->cpu_bstats); 1076 free_percpu(sch->cpu_qstats); 1077 sch->cpu_bstats = NULL; 1078 sch->cpu_qstats = NULL; 1079 sch->flags &= ~TCQ_F_CPUSTATS; 1080 } 1081 1082 /* Graft qdisc "new" to class "classid" of qdisc "parent" or 1083 * to device "dev". 1084 * 1085 * When appropriate send a netlink notification using 'skb' 1086 * and "n". 1087 * 1088 * On success, destroy old qdisc. 1089 */ 1090 1091 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, 1092 struct sk_buff *skb, struct nlmsghdr *n, u32 classid, 1093 struct Qdisc *new, struct Qdisc *old, 1094 struct netlink_ext_ack *extack) 1095 { 1096 struct Qdisc *q = old; 1097 struct net *net = dev_net(dev); 1098 1099 if (parent == NULL) { 1100 unsigned int i, num_q, ingress; 1101 struct netdev_queue *dev_queue; 1102 1103 ingress = 0; 1104 num_q = dev->num_tx_queues; 1105 if ((q && q->flags & TCQ_F_INGRESS) || 1106 (new && new->flags & TCQ_F_INGRESS)) { 1107 ingress = 1; 1108 dev_queue = dev_ingress_queue(dev); 1109 if (!dev_queue) { 1110 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); 1111 return -ENOENT; 1112 } 1113 1114 q = rtnl_dereference(dev_queue->qdisc_sleeping); 1115 1116 /* This is the counterpart of that qdisc_refcount_inc_nz() call in 1117 * __tcf_qdisc_find() for filter requests. 1118 */ 1119 if (!qdisc_refcount_dec_if_one(q)) { 1120 NL_SET_ERR_MSG(extack, 1121 "Current ingress or clsact Qdisc has ongoing filter requests"); 1122 return -EBUSY; 1123 } 1124 } 1125 1126 if (dev->flags & IFF_UP) 1127 dev_deactivate(dev, false); 1128 1129 qdisc_offload_graft_root(dev, new, old, extack); 1130 1131 if (new && new->ops->attach && !ingress) 1132 goto skip; 1133 1134 if (!ingress) { 1135 for (i = 0; i < num_q; i++) { 1136 dev_queue = netdev_get_tx_queue(dev, i); 1137 old = dev_graft_qdisc(dev_queue, new); 1138 1139 if (new && i > 0) 1140 qdisc_refcount_inc(new); 1141 qdisc_put(old); 1142 } 1143 } else { 1144 old = dev_graft_qdisc(dev_queue, NULL); 1145 1146 /* {ingress,clsact}_destroy() @old before grafting @new to avoid 1147 * unprotected concurrent accesses to net_device::miniq_{in,e}gress 1148 * pointer(s) in mini_qdisc_pair_swap(). 1149 */ 1150 qdisc_notify(net, skb, n, classid, old, new, extack); 1151 qdisc_destroy(old); 1152 1153 dev_graft_qdisc(dev_queue, new); 1154 } 1155 1156 skip: 1157 if (!ingress) { 1158 old = rtnl_dereference(dev->qdisc); 1159 if (new && !new->ops->attach) 1160 qdisc_refcount_inc(new); 1161 rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); 1162 1163 notify_and_destroy(net, skb, n, classid, old, new, extack); 1164 1165 if (new && new->ops->attach) 1166 new->ops->attach(new); 1167 } 1168 1169 if (dev->flags & IFF_UP) 1170 dev_activate(dev); 1171 } else { 1172 const struct Qdisc_class_ops *cops = parent->ops->cl_ops; 1173 unsigned long cl; 1174 int err; 1175 1176 /* Only support running class lockless if parent is lockless */ 1177 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) 1178 qdisc_clear_nolock(new); 1179 1180 if (!cops || !cops->graft) 1181 return -EOPNOTSUPP; 1182 1183 cl = cops->find(parent, classid); 1184 if (!cl) { 1185 NL_SET_ERR_MSG(extack, "Specified class not found"); 1186 return -ENOENT; 1187 } 1188 1189 if (new && new->ops == &noqueue_qdisc_ops) { 1190 NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); 1191 return -EINVAL; 1192 } 1193 1194 if (new && 1195 !(parent->flags & TCQ_F_MQROOT) && 1196 rcu_access_pointer(new->stab)) { 1197 NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); 1198 return -EINVAL; 1199 } 1200 err = cops->graft(parent, cl, new, &old, extack); 1201 if (err) 1202 return err; 1203 notify_and_destroy(net, skb, n, classid, old, new, extack); 1204 } 1205 return 0; 1206 } 1207 1208 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, 1209 struct netlink_ext_ack *extack) 1210 { 1211 u32 block_index; 1212 1213 if (tca[TCA_INGRESS_BLOCK]) { 1214 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); 1215 1216 if (!block_index) { 1217 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); 1218 return -EINVAL; 1219 } 1220 if (!sch->ops->ingress_block_set) { 1221 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); 1222 return -EOPNOTSUPP; 1223 } 1224 sch->ops->ingress_block_set(sch, block_index); 1225 } 1226 if (tca[TCA_EGRESS_BLOCK]) { 1227 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); 1228 1229 if (!block_index) { 1230 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); 1231 return -EINVAL; 1232 } 1233 if (!sch->ops->egress_block_set) { 1234 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); 1235 return -EOPNOTSUPP; 1236 } 1237 sch->ops->egress_block_set(sch, block_index); 1238 } 1239 return 0; 1240 } 1241 1242 /* 1243 Allocate and initialize new qdisc. 1244 1245 Parameters are passed via opt. 1246 */ 1247 1248 static struct Qdisc *qdisc_create(struct net_device *dev, 1249 struct netdev_queue *dev_queue, 1250 u32 parent, u32 handle, 1251 struct nlattr **tca, int *errp, 1252 struct netlink_ext_ack *extack) 1253 { 1254 int err; 1255 struct nlattr *kind = tca[TCA_KIND]; 1256 struct Qdisc *sch; 1257 struct Qdisc_ops *ops; 1258 struct qdisc_size_table *stab; 1259 1260 ops = qdisc_lookup_ops(kind); 1261 if (!ops) { 1262 err = -ENOENT; 1263 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); 1264 goto err_out; 1265 } 1266 1267 sch = qdisc_alloc(dev_queue, ops, extack); 1268 if (IS_ERR(sch)) { 1269 err = PTR_ERR(sch); 1270 goto err_out2; 1271 } 1272 1273 sch->parent = parent; 1274 1275 if (handle == TC_H_INGRESS) { 1276 if (!(sch->flags & TCQ_F_INGRESS)) { 1277 NL_SET_ERR_MSG(extack, 1278 "Specified parent ID is reserved for ingress and clsact Qdiscs"); 1279 err = -EINVAL; 1280 goto err_out3; 1281 } 1282 handle = TC_H_MAKE(TC_H_INGRESS, 0); 1283 } else { 1284 if (handle == 0) { 1285 handle = qdisc_alloc_handle(dev); 1286 if (handle == 0) { 1287 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded"); 1288 err = -ENOSPC; 1289 goto err_out3; 1290 } 1291 } 1292 if (!netif_is_multiqueue(dev)) 1293 sch->flags |= TCQ_F_ONETXQUEUE; 1294 } 1295 1296 sch->handle = handle; 1297 1298 /* This exist to keep backward compatible with a userspace 1299 * loophole, what allowed userspace to get IFF_NO_QUEUE 1300 * facility on older kernels by setting tx_queue_len=0 (prior 1301 * to qdisc init), and then forgot to reinit tx_queue_len 1302 * before again attaching a qdisc. 1303 */ 1304 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { 1305 WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); 1306 netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); 1307 } 1308 1309 err = qdisc_block_indexes_set(sch, tca, extack); 1310 if (err) 1311 goto err_out3; 1312 1313 if (tca[TCA_STAB]) { 1314 stab = qdisc_get_stab(tca[TCA_STAB], extack); 1315 if (IS_ERR(stab)) { 1316 err = PTR_ERR(stab); 1317 goto err_out3; 1318 } 1319 rcu_assign_pointer(sch->stab, stab); 1320 } 1321 1322 if (ops->init) { 1323 err = ops->init(sch, tca[TCA_OPTIONS], extack); 1324 if (err != 0) 1325 goto err_out4; 1326 } 1327 1328 if (tca[TCA_RATE]) { 1329 err = -EOPNOTSUPP; 1330 if (sch->flags & TCQ_F_MQROOT) { 1331 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); 1332 goto err_out4; 1333 } 1334 1335 err = gen_new_estimator(&sch->bstats, 1336 sch->cpu_bstats, 1337 &sch->rate_est, 1338 NULL, 1339 true, 1340 tca[TCA_RATE]); 1341 if (err) { 1342 NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); 1343 goto err_out4; 1344 } 1345 } 1346 1347 qdisc_hash_add(sch, false); 1348 trace_qdisc_create(ops, dev, parent); 1349 1350 return sch; 1351 1352 err_out4: 1353 /* Even if ops->init() failed, we call ops->destroy() 1354 * like qdisc_create_dflt(). 1355 */ 1356 if (ops->destroy) 1357 ops->destroy(sch); 1358 qdisc_put_stab(rtnl_dereference(sch->stab)); 1359 err_out3: 1360 qdisc_lock_uninit(sch, ops); 1361 netdev_put(dev, &sch->dev_tracker); 1362 qdisc_free(sch); 1363 err_out2: 1364 bpf_module_put(ops, ops->owner); 1365 err_out: 1366 *errp = err; 1367 return NULL; 1368 } 1369 1370 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, 1371 struct netlink_ext_ack *extack) 1372 { 1373 struct qdisc_size_table *ostab, *stab = NULL; 1374 int err = 0; 1375 1376 if (tca[TCA_OPTIONS]) { 1377 if (!sch->ops->change) { 1378 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); 1379 return -EINVAL; 1380 } 1381 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { 1382 NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); 1383 return -EOPNOTSUPP; 1384 } 1385 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); 1386 if (err) 1387 return err; 1388 } 1389 1390 if (tca[TCA_STAB]) { 1391 stab = qdisc_get_stab(tca[TCA_STAB], extack); 1392 if (IS_ERR(stab)) 1393 return PTR_ERR(stab); 1394 } 1395 1396 ostab = rtnl_dereference(sch->stab); 1397 rcu_assign_pointer(sch->stab, stab); 1398 qdisc_put_stab(ostab); 1399 1400 if (tca[TCA_RATE]) { 1401 /* NB: ignores errors from replace_estimator 1402 because change can't be undone. */ 1403 if (sch->flags & TCQ_F_MQROOT) 1404 goto out; 1405 gen_replace_estimator(&sch->bstats, 1406 sch->cpu_bstats, 1407 &sch->rate_est, 1408 NULL, 1409 true, 1410 tca[TCA_RATE]); 1411 } 1412 out: 1413 return 0; 1414 } 1415 1416 struct check_loop_arg { 1417 struct qdisc_walker w; 1418 struct Qdisc *p; 1419 int depth; 1420 }; 1421 1422 static int check_loop_fn(struct Qdisc *q, unsigned long cl, 1423 struct qdisc_walker *w); 1424 1425 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) 1426 { 1427 struct check_loop_arg arg; 1428 1429 if (q->ops->cl_ops == NULL) 1430 return 0; 1431 1432 arg.w.stop = arg.w.skip = arg.w.count = 0; 1433 arg.w.fn = check_loop_fn; 1434 arg.depth = depth; 1435 arg.p = p; 1436 q->ops->cl_ops->walk(q, &arg.w); 1437 return arg.w.stop ? -ELOOP : 0; 1438 } 1439 1440 static int 1441 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) 1442 { 1443 struct Qdisc *leaf; 1444 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 1445 struct check_loop_arg *arg = (struct check_loop_arg *)w; 1446 1447 leaf = cops->leaf(q, cl); 1448 if (leaf) { 1449 if (leaf == arg->p || arg->depth > 7) 1450 return -ELOOP; 1451 return check_loop(leaf, arg->p, arg->depth + 1); 1452 } 1453 return 0; 1454 } 1455 1456 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { 1457 [TCA_KIND] = { .type = NLA_STRING }, 1458 [TCA_RATE] = { .type = NLA_BINARY, 1459 .len = sizeof(struct tc_estimator) }, 1460 [TCA_STAB] = { .type = NLA_NESTED }, 1461 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, 1462 [TCA_CHAIN] = { .type = NLA_U32 }, 1463 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, 1464 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, 1465 }; 1466 1467 /* 1468 * Delete/get qdisc. 1469 */ 1470 1471 static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1472 struct netlink_ext_ack *extack, 1473 struct net_device *dev, 1474 struct nlattr *tca[TCA_MAX + 1], 1475 struct tcmsg *tcm) 1476 { 1477 struct net *net = sock_net(skb->sk); 1478 struct Qdisc *q = NULL; 1479 struct Qdisc *p = NULL; 1480 u32 clid; 1481 int err; 1482 1483 clid = tcm->tcm_parent; 1484 if (clid) { 1485 if (clid != TC_H_ROOT) { 1486 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { 1487 p = qdisc_lookup(dev, TC_H_MAJ(clid)); 1488 if (!p) { 1489 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); 1490 return -ENOENT; 1491 } 1492 q = qdisc_leaf(p, clid, extack); 1493 } else if (dev_ingress_queue(dev)) { 1494 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); 1495 } 1496 } else { 1497 q = rtnl_dereference(dev->qdisc); 1498 } 1499 if (!q) { 1500 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); 1501 return -ENOENT; 1502 } 1503 if (IS_ERR(q)) 1504 return PTR_ERR(q); 1505 1506 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { 1507 NL_SET_ERR_MSG(extack, "Invalid handle"); 1508 return -EINVAL; 1509 } 1510 } else { 1511 q = qdisc_lookup(dev, tcm->tcm_handle); 1512 if (!q) { 1513 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle"); 1514 return -ENOENT; 1515 } 1516 } 1517 1518 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1519 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1520 return -EINVAL; 1521 } 1522 1523 if (n->nlmsg_type == RTM_DELQDISC) { 1524 if (!clid) { 1525 NL_SET_ERR_MSG(extack, "Classid cannot be zero"); 1526 return -EINVAL; 1527 } 1528 if (q->handle == 0) { 1529 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero"); 1530 return -ENOENT; 1531 } 1532 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack); 1533 if (err != 0) 1534 return err; 1535 } else { 1536 qdisc_get_notify(net, skb, n, clid, q, NULL); 1537 } 1538 return 0; 1539 } 1540 1541 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1542 struct netlink_ext_ack *extack) 1543 { 1544 struct net *net = sock_net(skb->sk); 1545 struct tcmsg *tcm = nlmsg_data(n); 1546 struct nlattr *tca[TCA_MAX + 1]; 1547 struct net_device *dev; 1548 int err; 1549 1550 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 1551 rtm_tca_policy, extack); 1552 if (err < 0) 1553 return err; 1554 1555 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 1556 if (!dev) 1557 return -ENODEV; 1558 1559 netdev_lock_ops(dev); 1560 err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm); 1561 netdev_unlock_ops(dev); 1562 1563 return err; 1564 } 1565 1566 static bool req_create_or_replace(struct nlmsghdr *n) 1567 { 1568 return (n->nlmsg_flags & NLM_F_CREATE && 1569 n->nlmsg_flags & NLM_F_REPLACE); 1570 } 1571 1572 static bool req_create_exclusive(struct nlmsghdr *n) 1573 { 1574 return (n->nlmsg_flags & NLM_F_CREATE && 1575 n->nlmsg_flags & NLM_F_EXCL); 1576 } 1577 1578 static bool req_change(struct nlmsghdr *n) 1579 { 1580 return (!(n->nlmsg_flags & NLM_F_CREATE) && 1581 !(n->nlmsg_flags & NLM_F_REPLACE) && 1582 !(n->nlmsg_flags & NLM_F_EXCL)); 1583 } 1584 1585 static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1586 struct netlink_ext_ack *extack, 1587 struct net_device *dev, 1588 struct nlattr *tca[TCA_MAX + 1], 1589 struct tcmsg *tcm) 1590 { 1591 struct Qdisc *q = NULL; 1592 struct Qdisc *p = NULL; 1593 u32 clid; 1594 int err; 1595 1596 clid = tcm->tcm_parent; 1597 1598 if (clid) { 1599 if (clid != TC_H_ROOT) { 1600 if (clid != TC_H_INGRESS) { 1601 p = qdisc_lookup(dev, TC_H_MAJ(clid)); 1602 if (!p) { 1603 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); 1604 return -ENOENT; 1605 } 1606 if (p->flags & TCQ_F_INGRESS) { 1607 NL_SET_ERR_MSG(extack, 1608 "Cannot add children to ingress/clsact qdisc"); 1609 return -EOPNOTSUPP; 1610 } 1611 q = qdisc_leaf(p, clid, extack); 1612 if (IS_ERR(q)) 1613 return PTR_ERR(q); 1614 } else if (dev_ingress_queue_create(dev)) { 1615 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); 1616 } 1617 } else { 1618 q = rtnl_dereference(dev->qdisc); 1619 } 1620 1621 /* It may be default qdisc, ignore it */ 1622 if (q && q->handle == 0) 1623 q = NULL; 1624 1625 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { 1626 if (tcm->tcm_handle) { 1627 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) { 1628 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override"); 1629 return -EEXIST; 1630 } 1631 if (TC_H_MIN(tcm->tcm_handle)) { 1632 NL_SET_ERR_MSG(extack, "Invalid minor handle"); 1633 return -EINVAL; 1634 } 1635 q = qdisc_lookup(dev, tcm->tcm_handle); 1636 if (!q) 1637 goto create_n_graft; 1638 if (q->parent != tcm->tcm_parent) { 1639 NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); 1640 return -EINVAL; 1641 } 1642 if (n->nlmsg_flags & NLM_F_EXCL) { 1643 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); 1644 return -EEXIST; 1645 } 1646 if (tca[TCA_KIND] && 1647 nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1648 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1649 return -EINVAL; 1650 } 1651 if (q->flags & TCQ_F_INGRESS) { 1652 NL_SET_ERR_MSG(extack, 1653 "Cannot regraft ingress or clsact Qdiscs"); 1654 return -EINVAL; 1655 } 1656 if (q == p || 1657 (p && check_loop(q, p, 0))) { 1658 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); 1659 return -ELOOP; 1660 } 1661 if (clid == TC_H_INGRESS) { 1662 NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); 1663 return -EINVAL; 1664 } 1665 qdisc_refcount_inc(q); 1666 goto graft; 1667 } else { 1668 if (!q) 1669 goto create_n_graft; 1670 1671 /* This magic test requires explanation. 1672 * 1673 * We know, that some child q is already 1674 * attached to this parent and have choice: 1675 * 1) change it or 2) create/graft new one. 1676 * If the requested qdisc kind is different 1677 * than the existing one, then we choose graft. 1678 * If they are the same then this is "change" 1679 * operation - just let it fallthrough.. 1680 * 1681 * 1. We are allowed to create/graft only 1682 * if the request is explicitly stating 1683 * "please create if it doesn't exist". 1684 * 1685 * 2. If the request is to exclusive create 1686 * then the qdisc tcm_handle is not expected 1687 * to exist, so that we choose create/graft too. 1688 * 1689 * 3. The last case is when no flags are set. 1690 * This will happen when for example tc 1691 * utility issues a "change" command. 1692 * Alas, it is sort of hole in API, we 1693 * cannot decide what to do unambiguously. 1694 * For now we select create/graft. 1695 */ 1696 if (tca[TCA_KIND] && 1697 nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1698 if (req_create_or_replace(n) || 1699 req_create_exclusive(n)) 1700 goto create_n_graft; 1701 else if (req_change(n)) 1702 goto create_n_graft2; 1703 } 1704 } 1705 } 1706 } else { 1707 if (!tcm->tcm_handle) { 1708 NL_SET_ERR_MSG(extack, "Handle cannot be zero"); 1709 return -EINVAL; 1710 } 1711 q = qdisc_lookup(dev, tcm->tcm_handle); 1712 } 1713 1714 /* Change qdisc parameters */ 1715 if (!q) { 1716 NL_SET_ERR_MSG(extack, "Specified qdisc not found"); 1717 return -ENOENT; 1718 } 1719 if (n->nlmsg_flags & NLM_F_EXCL) { 1720 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); 1721 return -EEXIST; 1722 } 1723 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1724 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1725 return -EINVAL; 1726 } 1727 err = qdisc_change(q, tca, extack); 1728 if (err == 0) 1729 qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack); 1730 return err; 1731 1732 create_n_graft: 1733 if (!(n->nlmsg_flags & NLM_F_CREATE)) { 1734 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); 1735 return -ENOENT; 1736 } 1737 create_n_graft2: 1738 if (clid == TC_H_INGRESS) { 1739 if (dev_ingress_queue(dev)) { 1740 q = qdisc_create(dev, dev_ingress_queue(dev), 1741 tcm->tcm_parent, tcm->tcm_parent, 1742 tca, &err, extack); 1743 } else { 1744 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); 1745 err = -ENOENT; 1746 } 1747 } else { 1748 struct netdev_queue *dev_queue; 1749 1750 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) 1751 dev_queue = p->ops->cl_ops->select_queue(p, tcm); 1752 else if (p) 1753 dev_queue = p->dev_queue; 1754 else 1755 dev_queue = netdev_get_tx_queue(dev, 0); 1756 1757 q = qdisc_create(dev, dev_queue, 1758 tcm->tcm_parent, tcm->tcm_handle, 1759 tca, &err, extack); 1760 } 1761 if (!q) 1762 return err; 1763 1764 graft: 1765 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); 1766 if (err) { 1767 if (q) 1768 qdisc_put(q); 1769 return err; 1770 } 1771 1772 return 0; 1773 } 1774 1775 static void request_qdisc_module(struct nlattr *kind) 1776 { 1777 struct Qdisc_ops *ops; 1778 char name[IFNAMSIZ]; 1779 1780 if (!kind) 1781 return; 1782 1783 ops = qdisc_lookup_ops(kind); 1784 if (ops) { 1785 bpf_module_put(ops, ops->owner); 1786 return; 1787 } 1788 1789 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { 1790 rtnl_unlock(); 1791 request_module(NET_SCH_ALIAS_PREFIX "%s", name); 1792 rtnl_lock(); 1793 } 1794 } 1795 1796 /* 1797 * Create/change qdisc. 1798 */ 1799 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1800 struct netlink_ext_ack *extack) 1801 { 1802 struct net *net = sock_net(skb->sk); 1803 struct nlattr *tca[TCA_MAX + 1]; 1804 struct net_device *dev; 1805 struct tcmsg *tcm; 1806 int err; 1807 1808 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 1809 rtm_tca_policy, extack); 1810 if (err < 0) 1811 return err; 1812 1813 request_qdisc_module(tca[TCA_KIND]); 1814 1815 tcm = nlmsg_data(n); 1816 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 1817 if (!dev) 1818 return -ENODEV; 1819 1820 netdev_lock_ops(dev); 1821 err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm); 1822 netdev_unlock_ops(dev); 1823 1824 return err; 1825 } 1826 1827 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, 1828 struct netlink_callback *cb, 1829 int *q_idx_p, int s_q_idx, bool recur, 1830 bool dump_invisible) 1831 { 1832 const struct nlmsghdr *nlh = cb->nlh; 1833 int ret = 0, q_idx = *q_idx_p; 1834 const struct tcmsg *tcm; 1835 struct Qdisc *q; 1836 int b; 1837 1838 if (!root) 1839 return 0; 1840 1841 tcm = nlmsg_data(nlh); 1842 q = root; 1843 if (q_idx < s_q_idx) { 1844 q_idx++; 1845 } else { 1846 if (!tc_qdisc_dump_ignore(q, dump_invisible, tcm)) 1847 ret = tc_fill_qdisc(skb, q, q->parent, 1848 NETLINK_CB(cb->skb).portid, 1849 nlh->nlmsg_seq, NLM_F_MULTI, 1850 RTM_NEWQDISC, NULL); 1851 if (ret < 0) 1852 goto out; 1853 q_idx++; 1854 } 1855 1856 /* If dumping singletons, there is no qdisc_dev(root) and the singleton 1857 * itself has already been dumped. 1858 * 1859 * If we've already dumped the top-level (ingress) qdisc above and the global 1860 * qdisc hashtable, we don't want to hit it again 1861 */ 1862 if (!qdisc_dev(root) || !recur) 1863 goto out; 1864 1865 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { 1866 if (q_idx < s_q_idx) { 1867 q_idx++; 1868 continue; 1869 } 1870 if (!tc_qdisc_dump_ignore(q, dump_invisible, tcm)) 1871 ret = tc_fill_qdisc(skb, q, q->parent, 1872 NETLINK_CB(cb->skb).portid, 1873 nlh->nlmsg_seq, NLM_F_MULTI, 1874 RTM_NEWQDISC, NULL); 1875 if (ret < 0) 1876 goto out; 1877 q_idx++; 1878 } 1879 1880 out: 1881 *q_idx_p = q_idx; 1882 return ret; 1883 } 1884 1885 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) 1886 { 1887 const struct nlmsghdr *nlh = cb->nlh; 1888 struct net *net = sock_net(skb->sk); 1889 struct nlattr *tca[TCA_MAX + 1]; 1890 struct { 1891 unsigned long ifindex; 1892 int q_idx; 1893 } *ctx = (void *)cb->ctx; 1894 const struct tcmsg *tcm; 1895 struct net_device *dev; 1896 int s_q_idx, q_idx; 1897 int err; 1898 1899 ASSERT_RTNL(); 1900 1901 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX, 1902 rtm_tca_policy, cb->extack); 1903 if (err < 0) 1904 return err; 1905 tcm = nlmsg_data(nlh); 1906 if (tcm->tcm_ifindex && !ctx->ifindex) 1907 ctx->ifindex = tcm->tcm_ifindex; 1908 1909 s_q_idx = ctx->q_idx; 1910 1911 for_each_netdev_dump(net, dev, ctx->ifindex) { 1912 struct netdev_queue *dev_queue; 1913 struct Qdisc *q; 1914 1915 if (tcm->tcm_ifindex && ctx->ifindex != tcm->tcm_ifindex) 1916 break; 1917 1918 q_idx = 0; 1919 1920 netdev_lock_ops(dev); 1921 q = rtnl_dereference(dev->qdisc); 1922 err = tc_dump_qdisc_root(q, skb, cb, &q_idx, s_q_idx, 1923 true, tca[TCA_DUMP_INVISIBLE]); 1924 if (err < 0) 1925 goto error_unlock; 1926 1927 dev_queue = dev_ingress_queue(dev); 1928 if (dev_queue) { 1929 q = rtnl_dereference(dev_queue->qdisc_sleeping); 1930 err = tc_dump_qdisc_root(q, skb, cb, &q_idx, s_q_idx, 1931 false, tca[TCA_DUMP_INVISIBLE]); 1932 if (err < 0) 1933 goto error_unlock; 1934 } 1935 netdev_unlock_ops(dev); 1936 s_q_idx = 0; 1937 } 1938 return skb->len; 1939 1940 error_unlock: 1941 netdev_unlock_ops(dev); 1942 ctx->q_idx = q_idx; 1943 1944 return err; 1945 } 1946 1947 1948 1949 /************************************************ 1950 * Traffic classes manipulation. * 1951 ************************************************/ 1952 1953 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 1954 unsigned long cl, u32 portid, u32 seq, u16 flags, 1955 int event, struct netlink_ext_ack *extack) 1956 { 1957 struct tcmsg *tcm; 1958 struct nlmsghdr *nlh; 1959 unsigned char *b = skb_tail_pointer(skb); 1960 struct gnet_dump d; 1961 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; 1962 1963 cond_resched(); 1964 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); 1965 if (!nlh) 1966 goto out_nlmsg_trim; 1967 tcm = nlmsg_data(nlh); 1968 tcm->tcm_family = AF_UNSPEC; 1969 tcm->tcm__pad1 = 0; 1970 tcm->tcm__pad2 = 0; 1971 tcm->tcm_ifindex = qdisc_dev(q)->ifindex; 1972 tcm->tcm_parent = q->handle; 1973 tcm->tcm_handle = q->handle; 1974 tcm->tcm_info = 0; 1975 if (nla_put_string(skb, TCA_KIND, q->ops->id)) 1976 goto nla_put_failure; 1977 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) 1978 goto nla_put_failure; 1979 1980 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 1981 NULL, &d, TCA_PAD) < 0) 1982 goto nla_put_failure; 1983 1984 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) 1985 goto nla_put_failure; 1986 1987 if (gnet_stats_finish_copy(&d) < 0) 1988 goto nla_put_failure; 1989 1990 if (extack && extack->_msg && 1991 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) 1992 goto out_nlmsg_trim; 1993 1994 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 1995 1996 return skb->len; 1997 1998 out_nlmsg_trim: 1999 nla_put_failure: 2000 nlmsg_trim(skb, b); 2001 return -EMSGSIZE; 2002 } 2003 2004 static int tclass_notify(struct net *net, struct sk_buff *oskb, 2005 struct nlmsghdr *n, struct Qdisc *q, 2006 unsigned long cl, int event, struct netlink_ext_ack *extack) 2007 { 2008 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2009 struct sk_buff *skb; 2010 int ret; 2011 2012 if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) 2013 return 0; 2014 2015 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2016 if (!skb) 2017 return -ENOBUFS; 2018 2019 ret = tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack); 2020 if (ret < 0) { 2021 kfree_skb(skb); 2022 return ret; 2023 } 2024 2025 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 2026 n->nlmsg_flags & NLM_F_ECHO); 2027 } 2028 2029 static int tclass_get_notify(struct net *net, struct sk_buff *oskb, 2030 struct nlmsghdr *n, struct Qdisc *q, 2031 unsigned long cl, struct netlink_ext_ack *extack) 2032 { 2033 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2034 struct sk_buff *skb; 2035 int ret; 2036 2037 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2038 if (!skb) 2039 return -ENOBUFS; 2040 2041 ret = tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, 2042 RTM_NEWTCLASS, extack); 2043 if (ret < 0) { 2044 kfree_skb(skb); 2045 return ret; 2046 } 2047 2048 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 2049 n->nlmsg_flags & NLM_F_ECHO); 2050 } 2051 2052 static int tclass_del_notify(struct net *net, 2053 const struct Qdisc_class_ops *cops, 2054 struct sk_buff *oskb, struct nlmsghdr *n, 2055 struct Qdisc *q, unsigned long cl, 2056 struct netlink_ext_ack *extack) 2057 { 2058 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2059 struct sk_buff *skb = NULL; 2060 int err = 0; 2061 2062 if (!cops->delete) 2063 return -EOPNOTSUPP; 2064 2065 if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { 2066 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2067 if (!skb) 2068 return -ENOBUFS; 2069 2070 err = tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, 2071 RTM_DELTCLASS, extack); 2072 if (err < 0) { 2073 kfree_skb(skb); 2074 return err; 2075 } 2076 } 2077 2078 err = cops->delete(q, cl, extack); 2079 if (err) { 2080 kfree_skb(skb); 2081 return err; 2082 } 2083 2084 err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, 2085 n->nlmsg_flags & NLM_F_ECHO); 2086 return err; 2087 } 2088 2089 #ifdef CONFIG_NET_CLS 2090 2091 struct tcf_bind_args { 2092 struct tcf_walker w; 2093 unsigned long base; 2094 unsigned long cl; 2095 u32 classid; 2096 }; 2097 2098 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) 2099 { 2100 struct tcf_bind_args *a = (void *)arg; 2101 2102 if (n && tp->ops->bind_class) { 2103 struct Qdisc *q = tcf_block_q(tp->chain->block); 2104 2105 sch_tree_lock(q); 2106 tp->ops->bind_class(n, a->classid, a->cl, q, a->base); 2107 sch_tree_unlock(q); 2108 } 2109 return 0; 2110 } 2111 2112 struct tc_bind_class_args { 2113 struct qdisc_walker w; 2114 unsigned long new_cl; 2115 u32 portid; 2116 u32 clid; 2117 }; 2118 2119 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, 2120 struct qdisc_walker *w) 2121 { 2122 struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; 2123 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 2124 struct tcf_block *block; 2125 struct tcf_chain *chain; 2126 2127 block = cops->tcf_block(q, cl, NULL); 2128 if (!block) 2129 return 0; 2130 for (chain = tcf_get_next_chain(block, NULL); 2131 chain; 2132 chain = tcf_get_next_chain(block, chain)) { 2133 struct tcf_proto *tp; 2134 2135 for (tp = tcf_get_next_proto(chain, NULL); 2136 tp; tp = tcf_get_next_proto(chain, tp)) { 2137 struct tcf_bind_args arg = {}; 2138 2139 arg.w.fn = tcf_node_bind; 2140 arg.classid = a->clid; 2141 arg.base = cl; 2142 arg.cl = a->new_cl; 2143 tp->ops->walk(tp, &arg.w, true); 2144 } 2145 } 2146 2147 return 0; 2148 } 2149 2150 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, 2151 unsigned long new_cl) 2152 { 2153 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 2154 struct tc_bind_class_args args = {}; 2155 2156 if (!cops->tcf_block) 2157 return; 2158 args.portid = portid; 2159 args.clid = clid; 2160 args.new_cl = new_cl; 2161 args.w.fn = tc_bind_class_walker; 2162 q->ops->cl_ops->walk(q, &args.w); 2163 } 2164 2165 #else 2166 2167 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, 2168 unsigned long new_cl) 2169 { 2170 } 2171 2172 #endif 2173 2174 static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, 2175 struct netlink_ext_ack *extack, 2176 struct net_device *dev, 2177 struct nlattr *tca[TCA_MAX + 1], 2178 struct tcmsg *tcm) 2179 { 2180 struct net *net = sock_net(skb->sk); 2181 const struct Qdisc_class_ops *cops; 2182 struct Qdisc *q = NULL; 2183 unsigned long cl = 0; 2184 unsigned long new_cl; 2185 u32 portid; 2186 u32 clid; 2187 u32 qid; 2188 int err; 2189 2190 /* 2191 parent == TC_H_UNSPEC - unspecified parent. 2192 parent == TC_H_ROOT - class is root, which has no parent. 2193 parent == X:0 - parent is root class. 2194 parent == X:Y - parent is a node in hierarchy. 2195 parent == 0:Y - parent is X:Y, where X:0 is qdisc. 2196 2197 handle == 0:0 - generate handle from kernel pool. 2198 handle == 0:Y - class is X:Y, where X:0 is qdisc. 2199 handle == X:Y - clear. 2200 handle == X:0 - root class. 2201 */ 2202 2203 /* Step 1. Determine qdisc handle X:0 */ 2204 2205 portid = tcm->tcm_parent; 2206 clid = tcm->tcm_handle; 2207 qid = TC_H_MAJ(clid); 2208 2209 if (portid != TC_H_ROOT) { 2210 u32 qid1 = TC_H_MAJ(portid); 2211 2212 if (qid && qid1) { 2213 /* If both majors are known, they must be identical. */ 2214 if (qid != qid1) 2215 return -EINVAL; 2216 } else if (qid1) { 2217 qid = qid1; 2218 } else if (qid == 0) 2219 qid = rtnl_dereference(dev->qdisc)->handle; 2220 2221 /* Now qid is genuine qdisc handle consistent 2222 * both with parent and child. 2223 * 2224 * TC_H_MAJ(portid) still may be unspecified, complete it now. 2225 */ 2226 if (portid) 2227 portid = TC_H_MAKE(qid, portid); 2228 } else { 2229 if (qid == 0) 2230 qid = rtnl_dereference(dev->qdisc)->handle; 2231 } 2232 2233 /* OK. Locate qdisc */ 2234 q = qdisc_lookup(dev, qid); 2235 if (!q) 2236 return -ENOENT; 2237 2238 /* An check that it supports classes */ 2239 cops = q->ops->cl_ops; 2240 if (cops == NULL) 2241 return -EINVAL; 2242 2243 /* Now try to get class */ 2244 if (clid == 0) { 2245 if (portid == TC_H_ROOT) 2246 clid = qid; 2247 } else 2248 clid = TC_H_MAKE(qid, clid); 2249 2250 if (clid) 2251 cl = cops->find(q, clid); 2252 2253 if (cl == 0) { 2254 err = -ENOENT; 2255 if (n->nlmsg_type != RTM_NEWTCLASS || 2256 !(n->nlmsg_flags & NLM_F_CREATE)) 2257 goto out; 2258 } else { 2259 switch (n->nlmsg_type) { 2260 case RTM_NEWTCLASS: 2261 err = -EEXIST; 2262 if (n->nlmsg_flags & NLM_F_EXCL) 2263 goto out; 2264 break; 2265 case RTM_DELTCLASS: 2266 err = tclass_del_notify(net, cops, skb, n, q, cl, extack); 2267 /* Unbind the class with flilters with 0 */ 2268 tc_bind_tclass(q, portid, clid, 0); 2269 goto out; 2270 case RTM_GETTCLASS: 2271 err = tclass_get_notify(net, skb, n, q, cl, extack); 2272 goto out; 2273 default: 2274 err = -EINVAL; 2275 goto out; 2276 } 2277 } 2278 2279 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { 2280 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes"); 2281 return -EOPNOTSUPP; 2282 } 2283 2284 /* Prevent creation of traffic classes with classid TC_H_ROOT */ 2285 if (clid == TC_H_ROOT) { 2286 NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT"); 2287 return -EINVAL; 2288 } 2289 2290 new_cl = cl; 2291 err = -EOPNOTSUPP; 2292 if (cops->change) 2293 err = cops->change(q, clid, portid, tca, &new_cl, extack); 2294 if (err == 0) { 2295 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); 2296 /* We just create a new class, need to do reverse binding. */ 2297 if (cl != new_cl) 2298 tc_bind_tclass(q, portid, clid, new_cl); 2299 } 2300 out: 2301 return err; 2302 } 2303 2304 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, 2305 struct netlink_ext_ack *extack) 2306 { 2307 struct net *net = sock_net(skb->sk); 2308 struct tcmsg *tcm = nlmsg_data(n); 2309 struct nlattr *tca[TCA_MAX + 1]; 2310 struct net_device *dev; 2311 int err; 2312 2313 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 2314 rtm_tca_policy, extack); 2315 if (err < 0) 2316 return err; 2317 2318 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 2319 if (!dev) 2320 return -ENODEV; 2321 2322 netdev_lock_ops(dev); 2323 err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm); 2324 netdev_unlock_ops(dev); 2325 2326 return err; 2327 } 2328 2329 struct qdisc_dump_args { 2330 struct qdisc_walker w; 2331 struct sk_buff *skb; 2332 struct netlink_callback *cb; 2333 }; 2334 2335 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, 2336 struct qdisc_walker *arg) 2337 { 2338 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; 2339 2340 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, 2341 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, 2342 RTM_NEWTCLASS, NULL); 2343 } 2344 2345 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, 2346 struct tcmsg *tcm, struct netlink_callback *cb, 2347 int *t_p, int s_t) 2348 { 2349 struct qdisc_dump_args arg; 2350 2351 if (tc_qdisc_dump_ignore(q, false, NULL) || 2352 *t_p < s_t || !q->ops->cl_ops || 2353 (tcm->tcm_parent && 2354 TC_H_MAJ(tcm->tcm_parent) != q->handle)) { 2355 (*t_p)++; 2356 return 0; 2357 } 2358 if (*t_p > s_t) 2359 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); 2360 arg.w.fn = qdisc_class_dump; 2361 arg.skb = skb; 2362 arg.cb = cb; 2363 arg.w.stop = 0; 2364 arg.w.skip = cb->args[1]; 2365 arg.w.count = 0; 2366 q->ops->cl_ops->walk(q, &arg.w); 2367 cb->args[1] = arg.w.count; 2368 if (arg.w.stop) 2369 return -1; 2370 (*t_p)++; 2371 return 0; 2372 } 2373 2374 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, 2375 struct tcmsg *tcm, struct netlink_callback *cb, 2376 int *t_p, int s_t, bool recur) 2377 { 2378 struct Qdisc *q; 2379 int b; 2380 2381 if (!root) 2382 return 0; 2383 2384 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) 2385 return -1; 2386 2387 if (!qdisc_dev(root) || !recur) 2388 return 0; 2389 2390 if (tcm->tcm_parent) { 2391 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); 2392 if (q && q != root && 2393 tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) 2394 return -1; 2395 return 0; 2396 } 2397 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { 2398 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) 2399 return -1; 2400 } 2401 2402 return 0; 2403 } 2404 2405 static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb, 2406 struct tcmsg *tcm, struct net_device *dev) 2407 { 2408 struct netdev_queue *dev_queue; 2409 int t, s_t; 2410 2411 s_t = cb->args[0]; 2412 t = 0; 2413 2414 if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), 2415 skb, tcm, cb, &t, s_t, true) < 0) 2416 goto done; 2417 2418 dev_queue = dev_ingress_queue(dev); 2419 if (dev_queue && 2420 tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping), 2421 skb, tcm, cb, &t, s_t, false) < 0) 2422 goto done; 2423 2424 done: 2425 cb->args[0] = t; 2426 2427 return skb->len; 2428 } 2429 2430 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) 2431 { 2432 struct tcmsg *tcm = nlmsg_data(cb->nlh); 2433 struct net *net = sock_net(skb->sk); 2434 struct net_device *dev; 2435 int err; 2436 2437 if (nlmsg_len(cb->nlh) < sizeof(*tcm)) 2438 return 0; 2439 2440 dev = dev_get_by_index(net, tcm->tcm_ifindex); 2441 if (!dev) 2442 return 0; 2443 2444 netdev_lock_ops(dev); 2445 err = __tc_dump_tclass(skb, cb, tcm, dev); 2446 netdev_unlock_ops(dev); 2447 2448 dev_put(dev); 2449 2450 return err; 2451 } 2452 2453 #ifdef CONFIG_PROC_FS 2454 static int psched_show(struct seq_file *seq, void *v) 2455 { 2456 seq_printf(seq, "%08x %08x %08x %08x\n", 2457 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 2458 1000000, 2459 (u32)NSEC_PER_SEC / hrtimer_resolution); 2460 2461 return 0; 2462 } 2463 2464 static int __net_init psched_net_init(struct net *net) 2465 { 2466 struct proc_dir_entry *e; 2467 2468 e = proc_create_single("psched", 0, net->proc_net, psched_show); 2469 if (e == NULL) 2470 return -ENOMEM; 2471 2472 return 0; 2473 } 2474 2475 static void __net_exit psched_net_exit(struct net *net) 2476 { 2477 remove_proc_entry("psched", net->proc_net); 2478 } 2479 #else 2480 static int __net_init psched_net_init(struct net *net) 2481 { 2482 return 0; 2483 } 2484 2485 static void __net_exit psched_net_exit(struct net *net) 2486 { 2487 } 2488 #endif 2489 2490 static struct pernet_operations psched_net_ops = { 2491 .init = psched_net_init, 2492 .exit = psched_net_exit, 2493 }; 2494 2495 #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) 2496 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper_act); 2497 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper_cls); 2498 #endif 2499 2500 static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = { 2501 {.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc}, 2502 {.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc}, 2503 {.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc, 2504 .dumpit = tc_dump_qdisc}, 2505 {.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass}, 2506 {.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass}, 2507 {.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass, 2508 .dumpit = tc_dump_tclass}, 2509 }; 2510 2511 static int __init pktsched_init(void) 2512 { 2513 int err; 2514 2515 err = register_pernet_subsys(&psched_net_ops); 2516 if (err) { 2517 pr_err("pktsched_init: " 2518 "cannot initialize per netns operations\n"); 2519 return err; 2520 } 2521 2522 register_qdisc(&pfifo_fast_ops); 2523 register_qdisc(&pfifo_qdisc_ops); 2524 register_qdisc(&bfifo_qdisc_ops); 2525 register_qdisc(&pfifo_head_drop_qdisc_ops); 2526 register_qdisc(&mq_qdisc_ops); 2527 register_qdisc(&noqueue_qdisc_ops); 2528 2529 rtnl_register_many(psched_rtnl_msg_handlers); 2530 2531 tc_wrapper_init(); 2532 2533 return 0; 2534 } 2535 2536 subsys_initcall(pktsched_init); 2537