// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/bpf.h>

#include <net/netdev_lock.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tc_wrapper.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box, which is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in the order and at the times determined
   by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a form
   more intelligible to the kernel, to perform some sanity checks and
   the part of the work that is common to all qdiscs, and to provide
   rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty; it only means that
   the discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
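/* Illustrative sketch (not part of this file): the simplest qdisc that
 * honours the contract above is a bounded FIFO, much like the one in
 * sch_fifo.c. The names below are hypothetical:
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				   struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
 *			return qdisc_enqueue_tail(skb, sch); // NET_XMIT_SUCCESS
 *
 *		return qdisc_drop(skb, sch, to_free);	     // NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch); // NULL once q->q.qlen == 0
 *	}
 *
 * A "scheduler" would instead classify the skb and delegate to a child
 * qdisc; the NET_XMIT_* semantics described above apply either way.
 */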
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

void unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);

	WARN(err, "unregister qdisc(%s) failed\n", qops->id);
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get the default qdisc, if one is not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strscpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!bpf_try_module_get(q, q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set the new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module(NET_SCH_ALIAS_PREFIX "%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
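/* Note for readers: besides the late initcall below (CONFIG_NET_SCH_DEFAULT),
 * qdisc_set_default() is also reached at runtime from the
 * net.core.default_qdisc sysctl, e.g. "sysctl -w net.core.default_qdisc=fq",
 * which makes the named qdisc the root attached to newly activated devices.
 */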
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set the default value from the kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know the handle. Find the qdisc among all qdiscs attached to the
 * device (root qdisc, all its children, children of children, etc.)
 * Note: the caller either holds rtnl or rcu_read_lock()
 */
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
			handle);
out:
	return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
					  handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	return cops->leaf(p, cl);
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!bpf_try_module_get(q, q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utilities, we detect the linklayer setting by checking whether the
 * rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value. The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell. If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu up to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
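/* Worked example (illustrative numbers): with mpu = 0 and cell_log = 3,
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, so
 * cell_low = 0 >> 3 = 0 and cell_high = (48 >> 3) - 1 = 5. On an
 * ATM-aligned table every size from 1 to 48 bytes costs one 48-byte cell,
 * hence rtab[0] == rtab[5] and TC_LINKLAYER_ATM is reported; on a plain
 * Ethernet table those entries differ and we fall back to ETHERNET.
 */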
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
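/* Background note (assumption, for readers): a legacy rate table holds 256
 * u32 entries, entry i roughly encoding the time to transmit a packet whose
 * adjusted size falls into cell i = size >> cell_log at the configured rate.
 * Modern kernels compute transmission times directly; the table is kept and
 * cached here only so that old tc binaries, which still send the 1024-byte
 * tables, keep working.
 */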
static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
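/* Worked example (illustrative numbers): with overhead = 0, cell_align = 0,
 * cell_log = 6, size_log = 0 and tsize = 512, a 1000-byte skb gives
 * slot = 1000 >> 6 = 15 and pkt_len = stab->data[15]. A packet beyond the
 * table (slot >= tsize) is extrapolated linearly from the last entry, as
 * the arithmetic above shows.
 */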
void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	bool deactivated;

	rcu_read_lock();
	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
			       &qdisc_root_sleeping(wd->qdisc)->state);
	rcu_read_unlock();
	if (deactivated)
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		u64 softexpires;

		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
		/* If the timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		if (softexpires - expires <= delta_ns)
			return;
	}

	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
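/* Illustrative usage (hypothetical names): a shaping qdisc that cannot
 * transmit yet typically arms the watchdog from its dequeue routine and
 * returns NULL; the hrtimer then reschedules the root qdisc:
 *
 *	if (next_tx_time > now) {
 *		qdisc_watchdog_schedule_ns(&q->watchdog, next_tx_time);
 *		return NULL;
 *	}
 *
 * qdisc_watchdog_schedule_ns() is the delta_ns == 0 wrapper around the
 * function above, declared in include/net/pkt_sched.h.
 */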
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (parentid == TC_H_ROOT)
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify the parent qdisc only if the child qdisc becomes empty. */
		notify = !sch->q.qlen;
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			/* Note that qlen_notify must be idempotent as it may get called
			 * multiple times.
			 */
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
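/* Typical caller (illustrative): a classful qdisc that just dropped
 * "dropped" packets totalling "dropped_len" bytes from a child would call
 *
 *	qdisc_tree_reduce_backlog(sch, dropped, dropped_len);
 *
 * so that the qlen/backlog counters of all ancestors stay consistent and
 * an emptied child is reported upward via qlen_notify().
 */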
int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

void qdisc_offload_query_caps(struct net_device *dev,
			      enum tc_setup_type type,
			      void *caps, size_t caps_len)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_query_caps_base base = {
		.type = type,
		.caps = caps,
	};

	memset(caps, 0, caps_len);

	if (ops->ndo_setup_tc)
		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
}
EXPORT_SYMBOL(qdisc_offload_query_caps);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event,
			 struct netlink_ext_ack *extack)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}
static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
			    struct nlmsghdr *n, u32 clid, struct Qdisc *q,
			    struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (!tc_qdisc_dump_ignore(q, false)) {
		if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
				  RTM_NEWQDISC, extack) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new,
			struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
		return 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC, extack) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new,
			       struct netlink_ext_ack *extack)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new, extack);

	if (old)
		qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
	sch->flags &= ~TCQ_F_NOLOCK;
	if (!(sch->flags & TCQ_F_CPUSTATS))
		return;

	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	sch->cpu_bstats = NULL;
	sch->cpu_qstats = NULL;
	sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */
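/* Two request shapes funnel into qdisc_graft() (illustrative mapping):
 * "tc qdisc replace dev eth0 root ..." arrives with parent == NULL and
 * grafts onto every TX queue (or onto the ingress queue for ingress/clsact),
 * while "tc qdisc add dev eth0 parent 1:4 ..." resolves class 1:4 through
 * the parent's cl_ops and grafts via cops->graft().
 */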
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach && !ingress)
			goto skip;

		if (!ingress) {
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new, extack);

			if (new && new->ops->attach)
				new->ops->attach(new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		if (new &&
		    !(parent->flags & TCQ_F_MQROOT) &&
		    rcu_access_pointer(new->stab)) {
			NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
			return -EINVAL;
		}
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new, extack);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/*
 * Allocate and initialize a new qdisc.
 *
 * Parameters are passed via opt.
 */
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
	if (!ops) {
		err = -ENOENT;
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN);
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out3;
		}
		rcu_assign_pointer(sch->stab, stab);
	}

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out4;
	}

	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out4:
	/* Even if ops->init() failed, we call ops->destroy(),
	 * like qdisc_create_dflt() does.
	 */
	if (ops->destroy)
		ops->destroy(sch);
	qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
	lockdep_unregister_key(&sch->root_lock_key);
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	bpf_module_put(ops, ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		 * because the change can't be undone.
		 */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
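/* Illustrative consequence: trying to graft a qdisc underneath one of its
 * own descendants, e.g. moving 1: below 1:1, makes check_loop() find "p"
 * among the leaves of "q" (or exceed the depth limit of 7), so the request
 * fails with -ELOOP instead of creating a cycle.
 */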
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			  struct netlink_ext_ack *extack,
			  struct net_device *dev,
			  struct nlattr *tca[TCA_MAX + 1],
			  struct tcmsg *tcm)
{
	struct net *net = sock_net(skb->sk);
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	u32 clid;
	int err;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_get_notify(net, skb, n, clid, q, NULL);
	}
	return 0;
}

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	netdev_lock_ops(dev);
	err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm);
	netdev_unlock_ops(dev);

	return err;
}

static bool req_create_or_replace(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_REPLACE);
}

static bool req_create_exclusive(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_EXCL);
}

static bool req_change(struct nlmsghdr *n)
{
	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
		!(n->nlmsg_flags & NLM_F_REPLACE) &&
		!(n->nlmsg_flags & NLM_F_EXCL));
}
static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			     struct netlink_ext_ack *extack,
			     struct net_device *dev,
			     struct nlattr *tca[TCA_MAX + 1],
			     struct tcmsg *tcm)
{
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	u32 clid;
	int err;

	clid = tcm->tcm_parent;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (q->parent != tcm->tcm_parent) {
					NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent");
					return -EINVAL;
				}
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * 1) change it or 2) create/graft a new one.
				 * If the requested qdisc kind is different
				 * from the existing one, then we choose graft.
				 * If they are the same, then this is a "change"
				 * operation - just let it fall through.
				 *
				 * 1. We are allowed to create/graft only
				 * if the request is explicitly stating
				 * "please create if it doesn't exist".
				 *
				 * 2. If the request is to exclusively create,
				 * then the qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * This will happen, for example, when the tc
				 * utility issues a "change" command.
				 * Alas, this is a sort of hole in the API; we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft.
				 */
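				/* Summary of the three cases (for readers):
				 *
				 *   NLM_F_CREATE|NLM_F_REPLACE -> create_n_graft
				 *   NLM_F_CREATE|NLM_F_EXCL    -> create_n_graft
				 *   no flags ("change")        -> create_n_graft2,
				 *       which skips the NLM_F_CREATE check
				 */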
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					if (req_create_or_replace(n) ||
					    req_create_exclusive(n))
						goto create_n_graft;
					else if (req_change(n))
						goto create_n_graft2;
				}
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
create_n_graft2:
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev),
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (!q)
		return err;

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}

static void request_qdisc_module(struct nlattr *kind)
{
	struct Qdisc_ops *ops;
	char name[IFNAMSIZ];

	if (!kind)
		return;

	ops = qdisc_lookup_ops(kind);
	if (ops) {
		bpf_module_put(ops, ops->owner);
		return;
	}

	if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
		rtnl_unlock();
		request_module(NET_SCH_ALIAS_PREFIX "%s", name);
		rtnl_lock();
	}
}

/*
 * Create/change qdisc.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct tcmsg *tcm;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	request_qdisc_module(tca[TCA_KIND]);

	tcm = nlmsg_data(n);
	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	netdev_lock_ops(dev);
	err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm);
	netdev_unlock_ops(dev);

	return err;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC, NULL) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the
	 * global qdisc hashtable, we don't want to hit it again.
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC, NULL) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		netdev_lock_ops(dev);
		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0) {
			netdev_unlock_ops(dev);
			goto done;
		}

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				       skb, cb, &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0) {
			netdev_unlock_ops(dev);
			goto done;
		}
		netdev_unlock_ops(dev);

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
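/* Dump cursor note (for readers): like other rtnetlink dumpers this one is
 * resumable. cb->args[0] holds the device index and cb->args[1] the
 * per-device qdisc index already emitted, so a dump that overflowed one
 * netlink message restarts exactly where it left off.
 */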


/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl, u32 portid, u32 seq, u16 flags,
			  int event, struct netlink_ext_ack *extack)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event, struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
		return 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
			     struct nlmsghdr *n, struct Qdisc *q,
			     unsigned long cl, struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
			   extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl,
			     struct netlink_ext_ack *extack)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
		if (!skb)
			return -ENOBUFS;

		if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
				   RTM_DELTCLASS, extack) < 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	} else {
		skb = NULL;
	}

	err = cops->delete(q, cl, extack);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
				   n->nlmsg_flags & NLM_F_ECHO);
	return err;
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (n && tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
};

static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tc_bind_class_args args = {};

	if (!cops->tcf_block)
		return;
	args.portid = portid;
	args.clid = clid;
	args.new_cl = new_cl;
	args.w.fn = tc_bind_class_walker;
	q->ops->cl_ops->walk(q, &args.w);
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif
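/* Why the walk above exists (for readers): filters may cache a direct
 * class pointer via tp->ops->bind_class(). When a class is deleted or
 * replaced, tc_bind_tclass() re-walks every filter bound to the old
 * classid and rebinds it - to 0 on delete, or to the new class after a
 * successful change - so that no filter keeps a stale class reference.
 */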
static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack,
			   struct net_device *dev,
			   struct nlattr *tca[TCA_MAX + 1],
			   struct tcmsg *tcm)
{
	struct net *net = sock_net(skb->sk);
	const struct Qdisc_class_ops *cops;
	struct Qdisc *q = NULL;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class from its filters by rebinding
			 * them to classid 0.
			 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_get_notify(net, skb, n, q, cl, extack);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Prevent creation of traffic classes with classid TC_H_ROOT */
	if (clid == TC_H_ROOT) {
		NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT");
		return -EINVAL;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
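/* For reference, how the nlmsg_type/nlmsg_flags combinations handled
 * above map onto the usual tc(8) verbs (iproute2 is the canonical
 * source; listed here only as a reading aid):
 *
 *	tc class add		RTM_NEWTCLASS, NLM_F_CREATE | NLM_F_EXCL
 *	tc class change		RTM_NEWTCLASS, neither flag
 *	tc class replace	RTM_NEWTCLASS, NLM_F_CREATE
 *	tc class del		RTM_DELTCLASS
 *	tc class show		RTM_GETTCLASS dump
 */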
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	netdev_lock_ops(dev);
	err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm);
	netdev_unlock_ops(dev);

	return err;
}

struct qdisc_dump_args {
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS, NULL);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb,
			    struct tcmsg *tcm, struct net_device *dev)
{
	struct netdev_queue *dev_queue;
	int t, s_t;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	return skb->len;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;
	int err;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;

	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	netdev_lock_ops(dev);
	err = __tc_dump_tclass(skb, cb, tcm, dev);
	netdev_unlock_ops(dev);

	dev_put(dev);

	return err;
}
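/* Resume state for the class dump above follows the usual netlink dump
 * convention (a sketch of this file's usage, not a stable ABI):
 *
 *	cb->args[0]	index of the qdisc to resume from (t/s_t)
 *	cb->args[1]	class walker position inside that qdisc
 *			(fed back through arg.w.skip/arg.w.count)
 *
 * When the skb fills up, the walk stops, the partial skb is sent, and
 * the next callback invocation re-enters at the recorded indices,
 * skipping classes that were already dumped.
 */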
#ifdef CONFIG_PROC_FS
/* Four hex words read by userspace (notably iproute2) to calibrate its
 * tick<->time conversions; the third word is a fixed historical value
 * kept for compatibility.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif

static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = {
	{.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc},
	{.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc},
	{.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc,
	 .dumpit = tc_dump_qdisc},
	{.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass},
	{.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass},
	{.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass,
	 .dumpit = tc_dump_tclass},
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register_many(psched_rtnl_msg_handlers);

	tc_wrapper_init();

	return 0;
}

subsys_initcall(pktsched_init);
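/* Illustrative only (not kernel code): the handlers registered above
 * sit behind an ordinary NETLINK_ROUTE socket. A minimal class dump
 * request, assuming a hypothetical interface "eth0", looks like:
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct tcmsg tcm;
 *	} req = {
 *		.nlh.nlmsg_len	 = NLMSG_LENGTH(sizeof(struct tcmsg)),
 *		.nlh.nlmsg_type	 = RTM_GETTCLASS,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
 *		.tcm.tcm_family	 = AF_UNSPEC,
 *		.tcm.tcm_ifindex = if_nametoindex("eth0"),
 *	};
 *
 * Sent on the socket, this reaches tc_dump_tclass() through the
 * RTM_GETTCLASS .dumpit entry in psched_rtnl_msg_handlers[].
 */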