1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * net/sched/sch_api.c Packet scheduler API. 4 * 5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 6 * 7 * Fixes: 8 * 9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. 10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support 11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support 12 */ 13 14 #include <linux/module.h> 15 #include <linux/types.h> 16 #include <linux/kernel.h> 17 #include <linux/string.h> 18 #include <linux/errno.h> 19 #include <linux/skbuff.h> 20 #include <linux/init.h> 21 #include <linux/proc_fs.h> 22 #include <linux/seq_file.h> 23 #include <linux/kmod.h> 24 #include <linux/list.h> 25 #include <linux/hrtimer.h> 26 #include <linux/slab.h> 27 #include <linux/hashtable.h> 28 29 #include <net/netdev_lock.h> 30 #include <net/net_namespace.h> 31 #include <net/sock.h> 32 #include <net/netlink.h> 33 #include <net/pkt_sched.h> 34 #include <net/pkt_cls.h> 35 #include <net/tc_wrapper.h> 36 37 #include <trace/events/qdisc.h> 38 39 /* 40 41 Short review. 42 ------------- 43 44 This file consists of two interrelated parts: 45 46 1. queueing disciplines manager frontend. 47 2. traffic classes manager frontend. 48 49 Generally, queueing discipline ("qdisc") is a black box, 50 which is able to enqueue packets and to dequeue them (when 51 device is ready to send something) in order and at times 52 determined by algorithm hidden in it. 53 54 qdisc's are divided to two categories: 55 - "queues", which have no internal structure visible from outside. 56 - "schedulers", which split all the packets to "traffic classes", 57 using "packet classifiers" (look at cls_api.c) 58 59 In turn, classes may have child qdiscs (as rule, queues) 60 attached to them etc. etc. etc. 61 62 The goal of the routines in this file is to translate 63 information supplied by user in the form of handles 64 to more intelligible for kernel form, to make some sanity 65 checks and part of work, which is common to all qdiscs 66 and to provide rtnetlink notifications. 67 68 All real intelligent work is done inside qdisc modules. 69 70 71 72 Every discipline has two major routines: enqueue and dequeue. 73 74 ---dequeue 75 76 dequeue usually returns a skb to send. It is allowed to return NULL, 77 but it does not mean that queue is empty, it just means that 78 discipline does not want to send anything this time. 79 Queue is really empty if q->q.qlen == 0. 80 For complicated disciplines with multiple queues q->q is not 81 real packet queue, but however q->q.qlen must be valid. 82 83 ---enqueue 84 85 enqueue returns 0, if packet was enqueued successfully. 86 If packet (this one or another one) was dropped, it returns 87 not zero error code. 88 NET_XMIT_DROP - this packet dropped 89 Expected action: do not backoff, but wait until queue will clear. 90 NET_XMIT_CN - probably this packet enqueued, but another one dropped. 91 Expected action: backoff or ignore 92 93 Auxiliary routines: 94 95 ---peek 96 97 like dequeue but without removing a packet from the queue 98 99 ---reset 100 101 returns qdisc to initial state: purge all buffers, clear all 102 timers, counters (except for statistics) etc. 103 104 ---init 105 106 initializes newly created qdisc. 107 108 ---destroy 109 110 destroys resources allocated by init and during lifetime of qdisc. 111 112 ---change 113 114 changes qdisc parameters. 115 */ 116 117 /* Protects list of registered TC modules. It is pure SMP lock. */ 118 static DEFINE_RWLOCK(qdisc_mod_lock); 119 120 121 /************************************************ 122 * Queueing disciplines manipulation. * 123 ************************************************/ 124 125 126 /* The list of all installed queueing disciplines. */ 127 128 static struct Qdisc_ops *qdisc_base; 129 130 /* Register/unregister queueing discipline */ 131 132 int register_qdisc(struct Qdisc_ops *qops) 133 { 134 struct Qdisc_ops *q, **qp; 135 int rc = -EEXIST; 136 137 write_lock(&qdisc_mod_lock); 138 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 139 if (!strcmp(qops->id, q->id)) 140 goto out; 141 142 if (qops->enqueue == NULL) 143 qops->enqueue = noop_qdisc_ops.enqueue; 144 if (qops->peek == NULL) { 145 if (qops->dequeue == NULL) 146 qops->peek = noop_qdisc_ops.peek; 147 else 148 goto out_einval; 149 } 150 if (qops->dequeue == NULL) 151 qops->dequeue = noop_qdisc_ops.dequeue; 152 153 if (qops->cl_ops) { 154 const struct Qdisc_class_ops *cops = qops->cl_ops; 155 156 if (!(cops->find && cops->walk && cops->leaf)) 157 goto out_einval; 158 159 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) 160 goto out_einval; 161 } 162 163 qops->next = NULL; 164 *qp = qops; 165 rc = 0; 166 out: 167 write_unlock(&qdisc_mod_lock); 168 return rc; 169 170 out_einval: 171 rc = -EINVAL; 172 goto out; 173 } 174 EXPORT_SYMBOL(register_qdisc); 175 176 void unregister_qdisc(struct Qdisc_ops *qops) 177 { 178 struct Qdisc_ops *q, **qp; 179 int err = -ENOENT; 180 181 write_lock(&qdisc_mod_lock); 182 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 183 if (q == qops) 184 break; 185 if (q) { 186 *qp = q->next; 187 q->next = NULL; 188 err = 0; 189 } 190 write_unlock(&qdisc_mod_lock); 191 192 WARN(err, "unregister qdisc(%s) failed\n", qops->id); 193 } 194 EXPORT_SYMBOL(unregister_qdisc); 195 196 /* Get default qdisc if not otherwise specified */ 197 void qdisc_get_default(char *name, size_t len) 198 { 199 read_lock(&qdisc_mod_lock); 200 strscpy(name, default_qdisc_ops->id, len); 201 read_unlock(&qdisc_mod_lock); 202 } 203 204 static struct Qdisc_ops *qdisc_lookup_default(const char *name) 205 { 206 struct Qdisc_ops *q = NULL; 207 208 for (q = qdisc_base; q; q = q->next) { 209 if (!strcmp(name, q->id)) { 210 if (!try_module_get(q->owner)) 211 q = NULL; 212 break; 213 } 214 } 215 216 return q; 217 } 218 219 /* Set new default qdisc to use */ 220 int qdisc_set_default(const char *name) 221 { 222 const struct Qdisc_ops *ops; 223 224 if (!capable(CAP_NET_ADMIN)) 225 return -EPERM; 226 227 write_lock(&qdisc_mod_lock); 228 ops = qdisc_lookup_default(name); 229 if (!ops) { 230 /* Not found, drop lock and try to load module */ 231 write_unlock(&qdisc_mod_lock); 232 request_module(NET_SCH_ALIAS_PREFIX "%s", name); 233 write_lock(&qdisc_mod_lock); 234 235 ops = qdisc_lookup_default(name); 236 } 237 238 if (ops) { 239 /* Set new default */ 240 module_put(default_qdisc_ops->owner); 241 default_qdisc_ops = ops; 242 } 243 write_unlock(&qdisc_mod_lock); 244 245 return ops ? 0 : -ENOENT; 246 } 247 248 #ifdef CONFIG_NET_SCH_DEFAULT 249 /* Set default value from kernel config */ 250 static int __init sch_default_qdisc(void) 251 { 252 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH); 253 } 254 late_initcall(sch_default_qdisc); 255 #endif 256 257 /* We know handle. Find qdisc among all qdisc's attached to device 258 * (root qdisc, all its children, children of children etc.) 259 * Note: caller either uses rtnl or rcu_read_lock() 260 */ 261 262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) 263 { 264 struct Qdisc *q; 265 266 if (!qdisc_dev(root)) 267 return (root->handle == handle ? root : NULL); 268 269 if (!(root->flags & TCQ_F_BUILTIN) && 270 root->handle == handle) 271 return root; 272 273 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, 274 lockdep_rtnl_is_held()) { 275 if (q->handle == handle) 276 return q; 277 } 278 return NULL; 279 } 280 281 void qdisc_hash_add(struct Qdisc *q, bool invisible) 282 { 283 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 284 ASSERT_RTNL(); 285 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); 286 if (invisible) 287 q->flags |= TCQ_F_INVISIBLE; 288 } 289 } 290 EXPORT_SYMBOL(qdisc_hash_add); 291 292 void qdisc_hash_del(struct Qdisc *q) 293 { 294 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 295 ASSERT_RTNL(); 296 hash_del_rcu(&q->hash); 297 } 298 } 299 EXPORT_SYMBOL(qdisc_hash_del); 300 301 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) 302 { 303 struct Qdisc *q; 304 305 if (!handle) 306 return NULL; 307 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); 308 if (q) 309 goto out; 310 311 if (dev_ingress_queue(dev)) 312 q = qdisc_match_from_root( 313 rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping), 314 handle); 315 out: 316 return q; 317 } 318 319 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) 320 { 321 struct netdev_queue *nq; 322 struct Qdisc *q; 323 324 if (!handle) 325 return NULL; 326 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); 327 if (q) 328 goto out; 329 330 nq = dev_ingress_queue_rcu(dev); 331 if (nq) 332 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping), 333 handle); 334 out: 335 return q; 336 } 337 338 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) 339 { 340 unsigned long cl; 341 const struct Qdisc_class_ops *cops = p->ops->cl_ops; 342 343 if (cops == NULL) 344 return NULL; 345 cl = cops->find(p, classid); 346 347 if (cl == 0) 348 return NULL; 349 return cops->leaf(p, cl); 350 } 351 352 /* Find queueing discipline by name */ 353 354 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) 355 { 356 struct Qdisc_ops *q = NULL; 357 358 if (kind) { 359 read_lock(&qdisc_mod_lock); 360 for (q = qdisc_base; q; q = q->next) { 361 if (nla_strcmp(kind, q->id) == 0) { 362 if (!try_module_get(q->owner)) 363 q = NULL; 364 break; 365 } 366 } 367 read_unlock(&qdisc_mod_lock); 368 } 369 return q; 370 } 371 372 /* The linklayer setting were not transferred from iproute2, in older 373 * versions, and the rate tables lookup systems have been dropped in 374 * the kernel. To keep backward compatible with older iproute2 tc 375 * utils, we detect the linklayer setting by detecting if the rate 376 * table were modified. 377 * 378 * For linklayer ATM table entries, the rate table will be aligned to 379 * 48 bytes, thus some table entries will contain the same value. The 380 * mpu (min packet unit) is also encoded into the old rate table, thus 381 * starting from the mpu, we find low and high table entries for 382 * mapping this cell. If these entries contain the same value, when 383 * the rate tables have been modified for linklayer ATM. 384 * 385 * This is done by rounding mpu to the nearest 48 bytes cell/entry, 386 * and then roundup to the next cell, calc the table entry one below, 387 * and compare. 388 */ 389 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) 390 { 391 int low = roundup(r->mpu, 48); 392 int high = roundup(low+1, 48); 393 int cell_low = low >> r->cell_log; 394 int cell_high = (high >> r->cell_log) - 1; 395 396 /* rtab is too inaccurate at rates > 100Mbit/s */ 397 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { 398 pr_debug("TC linklayer: Giving up ATM detection\n"); 399 return TC_LINKLAYER_ETHERNET; 400 } 401 402 if ((cell_high > cell_low) && (cell_high < 256) 403 && (rtab[cell_low] == rtab[cell_high])) { 404 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", 405 cell_low, cell_high, rtab[cell_high]); 406 return TC_LINKLAYER_ATM; 407 } 408 return TC_LINKLAYER_ETHERNET; 409 } 410 411 static struct qdisc_rate_table *qdisc_rtab_list; 412 413 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, 414 struct nlattr *tab, 415 struct netlink_ext_ack *extack) 416 { 417 struct qdisc_rate_table *rtab; 418 419 if (tab == NULL || r->rate == 0 || 420 r->cell_log == 0 || r->cell_log >= 32 || 421 nla_len(tab) != TC_RTAB_SIZE) { 422 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); 423 return NULL; 424 } 425 426 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { 427 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && 428 !memcmp(&rtab->data, nla_data(tab), 1024)) { 429 rtab->refcnt++; 430 return rtab; 431 } 432 } 433 434 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); 435 if (rtab) { 436 rtab->rate = *r; 437 rtab->refcnt = 1; 438 memcpy(rtab->data, nla_data(tab), 1024); 439 if (r->linklayer == TC_LINKLAYER_UNAWARE) 440 r->linklayer = __detect_linklayer(r, rtab->data); 441 rtab->next = qdisc_rtab_list; 442 qdisc_rtab_list = rtab; 443 } else { 444 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); 445 } 446 return rtab; 447 } 448 EXPORT_SYMBOL(qdisc_get_rtab); 449 450 void qdisc_put_rtab(struct qdisc_rate_table *tab) 451 { 452 struct qdisc_rate_table *rtab, **rtabp; 453 454 if (!tab || --tab->refcnt) 455 return; 456 457 for (rtabp = &qdisc_rtab_list; 458 (rtab = *rtabp) != NULL; 459 rtabp = &rtab->next) { 460 if (rtab == tab) { 461 *rtabp = rtab->next; 462 kfree(rtab); 463 return; 464 } 465 } 466 } 467 EXPORT_SYMBOL(qdisc_put_rtab); 468 469 static LIST_HEAD(qdisc_stab_list); 470 471 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { 472 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, 473 [TCA_STAB_DATA] = { .type = NLA_BINARY }, 474 }; 475 476 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, 477 struct netlink_ext_ack *extack) 478 { 479 struct nlattr *tb[TCA_STAB_MAX + 1]; 480 struct qdisc_size_table *stab; 481 struct tc_sizespec *s; 482 unsigned int tsize = 0; 483 u16 *tab = NULL; 484 int err; 485 486 err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy, 487 extack); 488 if (err < 0) 489 return ERR_PTR(err); 490 if (!tb[TCA_STAB_BASE]) { 491 NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); 492 return ERR_PTR(-EINVAL); 493 } 494 495 s = nla_data(tb[TCA_STAB_BASE]); 496 497 if (s->tsize > 0) { 498 if (!tb[TCA_STAB_DATA]) { 499 NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); 500 return ERR_PTR(-EINVAL); 501 } 502 tab = nla_data(tb[TCA_STAB_DATA]); 503 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); 504 } 505 506 if (tsize != s->tsize || (!tab && tsize > 0)) { 507 NL_SET_ERR_MSG(extack, "Invalid size of size table"); 508 return ERR_PTR(-EINVAL); 509 } 510 511 list_for_each_entry(stab, &qdisc_stab_list, list) { 512 if (memcmp(&stab->szopts, s, sizeof(*s))) 513 continue; 514 if (tsize > 0 && 515 memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) 516 continue; 517 stab->refcnt++; 518 return stab; 519 } 520 521 if (s->size_log > STAB_SIZE_LOG_MAX || 522 s->cell_log > STAB_SIZE_LOG_MAX) { 523 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); 524 return ERR_PTR(-EINVAL); 525 } 526 527 stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); 528 if (!stab) 529 return ERR_PTR(-ENOMEM); 530 531 stab->refcnt = 1; 532 stab->szopts = *s; 533 if (tsize > 0) 534 memcpy(stab->data, tab, flex_array_size(stab, data, tsize)); 535 536 list_add_tail(&stab->list, &qdisc_stab_list); 537 538 return stab; 539 } 540 541 void qdisc_put_stab(struct qdisc_size_table *tab) 542 { 543 if (!tab) 544 return; 545 546 if (--tab->refcnt == 0) { 547 list_del(&tab->list); 548 kfree_rcu(tab, rcu); 549 } 550 } 551 EXPORT_SYMBOL(qdisc_put_stab); 552 553 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) 554 { 555 struct nlattr *nest; 556 557 nest = nla_nest_start_noflag(skb, TCA_STAB); 558 if (nest == NULL) 559 goto nla_put_failure; 560 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) 561 goto nla_put_failure; 562 nla_nest_end(skb, nest); 563 564 return skb->len; 565 566 nla_put_failure: 567 return -1; 568 } 569 570 void __qdisc_calculate_pkt_len(struct sk_buff *skb, 571 const struct qdisc_size_table *stab) 572 { 573 int pkt_len, slot; 574 575 pkt_len = skb->len + stab->szopts.overhead; 576 if (unlikely(!stab->szopts.tsize)) 577 goto out; 578 579 slot = pkt_len + stab->szopts.cell_align; 580 if (unlikely(slot < 0)) 581 slot = 0; 582 583 slot >>= stab->szopts.cell_log; 584 if (likely(slot < stab->szopts.tsize)) 585 pkt_len = stab->data[slot]; 586 else 587 pkt_len = stab->data[stab->szopts.tsize - 1] * 588 (slot / stab->szopts.tsize) + 589 stab->data[slot % stab->szopts.tsize]; 590 591 pkt_len <<= stab->szopts.size_log; 592 out: 593 if (unlikely(pkt_len < 1)) 594 pkt_len = 1; 595 qdisc_skb_cb(skb)->pkt_len = pkt_len; 596 } 597 598 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) 599 { 600 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { 601 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", 602 txt, qdisc->ops->id, qdisc->handle >> 16); 603 qdisc->flags |= TCQ_F_WARN_NONWC; 604 } 605 } 606 EXPORT_SYMBOL(qdisc_warn_nonwc); 607 608 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) 609 { 610 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, 611 timer); 612 613 rcu_read_lock(); 614 __netif_schedule(qdisc_root(wd->qdisc)); 615 rcu_read_unlock(); 616 617 return HRTIMER_NORESTART; 618 } 619 620 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, 621 clockid_t clockid) 622 { 623 hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); 624 wd->timer.function = qdisc_watchdog; 625 wd->qdisc = qdisc; 626 } 627 EXPORT_SYMBOL(qdisc_watchdog_init_clockid); 628 629 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) 630 { 631 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); 632 } 633 EXPORT_SYMBOL(qdisc_watchdog_init); 634 635 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, 636 u64 delta_ns) 637 { 638 bool deactivated; 639 640 rcu_read_lock(); 641 deactivated = test_bit(__QDISC_STATE_DEACTIVATED, 642 &qdisc_root_sleeping(wd->qdisc)->state); 643 rcu_read_unlock(); 644 if (deactivated) 645 return; 646 647 if (hrtimer_is_queued(&wd->timer)) { 648 u64 softexpires; 649 650 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer)); 651 /* If timer is already set in [expires, expires + delta_ns], 652 * do not reprogram it. 653 */ 654 if (softexpires - expires <= delta_ns) 655 return; 656 } 657 658 hrtimer_start_range_ns(&wd->timer, 659 ns_to_ktime(expires), 660 delta_ns, 661 HRTIMER_MODE_ABS_PINNED); 662 } 663 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); 664 665 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) 666 { 667 hrtimer_cancel(&wd->timer); 668 } 669 EXPORT_SYMBOL(qdisc_watchdog_cancel); 670 671 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) 672 { 673 struct hlist_head *h; 674 unsigned int i; 675 676 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL); 677 678 if (h != NULL) { 679 for (i = 0; i < n; i++) 680 INIT_HLIST_HEAD(&h[i]); 681 } 682 return h; 683 } 684 685 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) 686 { 687 struct Qdisc_class_common *cl; 688 struct hlist_node *next; 689 struct hlist_head *nhash, *ohash; 690 unsigned int nsize, nmask, osize; 691 unsigned int i, h; 692 693 /* Rehash when load factor exceeds 0.75 */ 694 if (clhash->hashelems * 4 <= clhash->hashsize * 3) 695 return; 696 nsize = clhash->hashsize * 2; 697 nmask = nsize - 1; 698 nhash = qdisc_class_hash_alloc(nsize); 699 if (nhash == NULL) 700 return; 701 702 ohash = clhash->hash; 703 osize = clhash->hashsize; 704 705 sch_tree_lock(sch); 706 for (i = 0; i < osize; i++) { 707 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { 708 h = qdisc_class_hash(cl->classid, nmask); 709 hlist_add_head(&cl->hnode, &nhash[h]); 710 } 711 } 712 clhash->hash = nhash; 713 clhash->hashsize = nsize; 714 clhash->hashmask = nmask; 715 sch_tree_unlock(sch); 716 717 kvfree(ohash); 718 } 719 EXPORT_SYMBOL(qdisc_class_hash_grow); 720 721 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) 722 { 723 unsigned int size = 4; 724 725 clhash->hash = qdisc_class_hash_alloc(size); 726 if (!clhash->hash) 727 return -ENOMEM; 728 clhash->hashsize = size; 729 clhash->hashmask = size - 1; 730 clhash->hashelems = 0; 731 return 0; 732 } 733 EXPORT_SYMBOL(qdisc_class_hash_init); 734 735 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) 736 { 737 kvfree(clhash->hash); 738 } 739 EXPORT_SYMBOL(qdisc_class_hash_destroy); 740 741 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, 742 struct Qdisc_class_common *cl) 743 { 744 unsigned int h; 745 746 INIT_HLIST_NODE(&cl->hnode); 747 h = qdisc_class_hash(cl->classid, clhash->hashmask); 748 hlist_add_head(&cl->hnode, &clhash->hash[h]); 749 clhash->hashelems++; 750 } 751 EXPORT_SYMBOL(qdisc_class_hash_insert); 752 753 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, 754 struct Qdisc_class_common *cl) 755 { 756 hlist_del(&cl->hnode); 757 clhash->hashelems--; 758 } 759 EXPORT_SYMBOL(qdisc_class_hash_remove); 760 761 /* Allocate an unique handle from space managed by kernel 762 * Possible range is [8000-FFFF]:0000 (0x8000 values) 763 */ 764 static u32 qdisc_alloc_handle(struct net_device *dev) 765 { 766 int i = 0x8000; 767 static u32 autohandle = TC_H_MAKE(0x80000000U, 0); 768 769 do { 770 autohandle += TC_H_MAKE(0x10000U, 0); 771 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) 772 autohandle = TC_H_MAKE(0x80000000U, 0); 773 if (!qdisc_lookup(dev, autohandle)) 774 return autohandle; 775 cond_resched(); 776 } while (--i > 0); 777 778 return 0; 779 } 780 781 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) 782 { 783 bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; 784 const struct Qdisc_class_ops *cops; 785 unsigned long cl; 786 u32 parentid; 787 bool notify; 788 int drops; 789 790 if (n == 0 && len == 0) 791 return; 792 drops = max_t(int, n, 0); 793 rcu_read_lock(); 794 while ((parentid = sch->parent)) { 795 if (parentid == TC_H_ROOT) 796 break; 797 798 if (sch->flags & TCQ_F_NOPARENT) 799 break; 800 /* Notify parent qdisc only if child qdisc becomes empty. 801 * 802 * If child was empty even before update then backlog 803 * counter is screwed and we skip notification because 804 * parent class is already passive. 805 * 806 * If the original child was offloaded then it is allowed 807 * to be seem as empty, so the parent is notified anyway. 808 */ 809 notify = !sch->q.qlen && !WARN_ON_ONCE(!n && 810 !qdisc_is_offloaded); 811 /* TODO: perform the search on a per txq basis */ 812 sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); 813 if (sch == NULL) { 814 WARN_ON_ONCE(parentid != TC_H_ROOT); 815 break; 816 } 817 cops = sch->ops->cl_ops; 818 if (notify && cops->qlen_notify) { 819 cl = cops->find(sch, parentid); 820 cops->qlen_notify(sch, cl); 821 } 822 sch->q.qlen -= n; 823 sch->qstats.backlog -= len; 824 __qdisc_qstats_drop(sch, drops); 825 } 826 rcu_read_unlock(); 827 } 828 EXPORT_SYMBOL(qdisc_tree_reduce_backlog); 829 830 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, 831 void *type_data) 832 { 833 struct net_device *dev = qdisc_dev(sch); 834 int err; 835 836 sch->flags &= ~TCQ_F_OFFLOADED; 837 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 838 return 0; 839 840 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); 841 if (err == -EOPNOTSUPP) 842 return 0; 843 844 if (!err) 845 sch->flags |= TCQ_F_OFFLOADED; 846 847 return err; 848 } 849 EXPORT_SYMBOL(qdisc_offload_dump_helper); 850 851 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, 852 struct Qdisc *new, struct Qdisc *old, 853 enum tc_setup_type type, void *type_data, 854 struct netlink_ext_ack *extack) 855 { 856 bool any_qdisc_is_offloaded; 857 int err; 858 859 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 860 return; 861 862 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); 863 864 /* Don't report error if the graft is part of destroy operation. */ 865 if (!err || !new || new == &noop_qdisc) 866 return; 867 868 /* Don't report error if the parent, the old child and the new 869 * one are not offloaded. 870 */ 871 any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; 872 any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; 873 any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; 874 875 if (any_qdisc_is_offloaded) 876 NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); 877 } 878 EXPORT_SYMBOL(qdisc_offload_graft_helper); 879 880 void qdisc_offload_query_caps(struct net_device *dev, 881 enum tc_setup_type type, 882 void *caps, size_t caps_len) 883 { 884 const struct net_device_ops *ops = dev->netdev_ops; 885 struct tc_query_caps_base base = { 886 .type = type, 887 .caps = caps, 888 }; 889 890 memset(caps, 0, caps_len); 891 892 if (ops->ndo_setup_tc) 893 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base); 894 } 895 EXPORT_SYMBOL(qdisc_offload_query_caps); 896 897 static void qdisc_offload_graft_root(struct net_device *dev, 898 struct Qdisc *new, struct Qdisc *old, 899 struct netlink_ext_ack *extack) 900 { 901 struct tc_root_qopt_offload graft_offload = { 902 .command = TC_ROOT_GRAFT, 903 .handle = new ? new->handle : 0, 904 .ingress = (new && new->flags & TCQ_F_INGRESS) || 905 (old && old->flags & TCQ_F_INGRESS), 906 }; 907 908 qdisc_offload_graft_helper(dev, NULL, new, old, 909 TC_SETUP_ROOT_QDISC, &graft_offload, extack); 910 } 911 912 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 913 u32 portid, u32 seq, u16 flags, int event, 914 struct netlink_ext_ack *extack) 915 { 916 struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; 917 struct gnet_stats_queue __percpu *cpu_qstats = NULL; 918 struct tcmsg *tcm; 919 struct nlmsghdr *nlh; 920 unsigned char *b = skb_tail_pointer(skb); 921 struct gnet_dump d; 922 struct qdisc_size_table *stab; 923 u32 block_index; 924 __u32 qlen; 925 926 cond_resched(); 927 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); 928 if (!nlh) 929 goto out_nlmsg_trim; 930 tcm = nlmsg_data(nlh); 931 tcm->tcm_family = AF_UNSPEC; 932 tcm->tcm__pad1 = 0; 933 tcm->tcm__pad2 = 0; 934 tcm->tcm_ifindex = qdisc_dev(q)->ifindex; 935 tcm->tcm_parent = clid; 936 tcm->tcm_handle = q->handle; 937 tcm->tcm_info = refcount_read(&q->refcnt); 938 if (nla_put_string(skb, TCA_KIND, q->ops->id)) 939 goto nla_put_failure; 940 if (q->ops->ingress_block_get) { 941 block_index = q->ops->ingress_block_get(q); 942 if (block_index && 943 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) 944 goto nla_put_failure; 945 } 946 if (q->ops->egress_block_get) { 947 block_index = q->ops->egress_block_get(q); 948 if (block_index && 949 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) 950 goto nla_put_failure; 951 } 952 if (q->ops->dump && q->ops->dump(q, skb) < 0) 953 goto nla_put_failure; 954 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) 955 goto nla_put_failure; 956 qlen = qdisc_qlen_sum(q); 957 958 stab = rtnl_dereference(q->stab); 959 if (stab && qdisc_dump_stab(skb, stab) < 0) 960 goto nla_put_failure; 961 962 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 963 NULL, &d, TCA_PAD) < 0) 964 goto nla_put_failure; 965 966 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) 967 goto nla_put_failure; 968 969 if (qdisc_is_percpu_stats(q)) { 970 cpu_bstats = q->cpu_bstats; 971 cpu_qstats = q->cpu_qstats; 972 } 973 974 if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || 975 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || 976 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) 977 goto nla_put_failure; 978 979 if (gnet_stats_finish_copy(&d) < 0) 980 goto nla_put_failure; 981 982 if (extack && extack->_msg && 983 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) 984 goto out_nlmsg_trim; 985 986 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 987 988 return skb->len; 989 990 out_nlmsg_trim: 991 nla_put_failure: 992 nlmsg_trim(skb, b); 993 return -1; 994 } 995 996 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) 997 { 998 if (q->flags & TCQ_F_BUILTIN) 999 return true; 1000 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) 1001 return true; 1002 1003 return false; 1004 } 1005 1006 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, 1007 struct nlmsghdr *n, u32 clid, struct Qdisc *q, 1008 struct netlink_ext_ack *extack) 1009 { 1010 struct sk_buff *skb; 1011 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 1012 1013 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1014 if (!skb) 1015 return -ENOBUFS; 1016 1017 if (!tc_qdisc_dump_ignore(q, false)) { 1018 if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, 1019 RTM_NEWQDISC, extack) < 0) 1020 goto err_out; 1021 } 1022 1023 if (skb->len) 1024 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 1025 n->nlmsg_flags & NLM_F_ECHO); 1026 1027 err_out: 1028 kfree_skb(skb); 1029 return -EINVAL; 1030 } 1031 1032 static int qdisc_notify(struct net *net, struct sk_buff *oskb, 1033 struct nlmsghdr *n, u32 clid, 1034 struct Qdisc *old, struct Qdisc *new, 1035 struct netlink_ext_ack *extack) 1036 { 1037 struct sk_buff *skb; 1038 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 1039 1040 if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) 1041 return 0; 1042 1043 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1044 if (!skb) 1045 return -ENOBUFS; 1046 1047 if (old && !tc_qdisc_dump_ignore(old, false)) { 1048 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 1049 0, RTM_DELQDISC, extack) < 0) 1050 goto err_out; 1051 } 1052 if (new && !tc_qdisc_dump_ignore(new, false)) { 1053 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, 1054 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) 1055 goto err_out; 1056 } 1057 1058 if (skb->len) 1059 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 1060 n->nlmsg_flags & NLM_F_ECHO); 1061 1062 err_out: 1063 kfree_skb(skb); 1064 return -EINVAL; 1065 } 1066 1067 static void notify_and_destroy(struct net *net, struct sk_buff *skb, 1068 struct nlmsghdr *n, u32 clid, 1069 struct Qdisc *old, struct Qdisc *new, 1070 struct netlink_ext_ack *extack) 1071 { 1072 if (new || old) 1073 qdisc_notify(net, skb, n, clid, old, new, extack); 1074 1075 if (old) 1076 qdisc_put(old); 1077 } 1078 1079 static void qdisc_clear_nolock(struct Qdisc *sch) 1080 { 1081 sch->flags &= ~TCQ_F_NOLOCK; 1082 if (!(sch->flags & TCQ_F_CPUSTATS)) 1083 return; 1084 1085 free_percpu(sch->cpu_bstats); 1086 free_percpu(sch->cpu_qstats); 1087 sch->cpu_bstats = NULL; 1088 sch->cpu_qstats = NULL; 1089 sch->flags &= ~TCQ_F_CPUSTATS; 1090 } 1091 1092 /* Graft qdisc "new" to class "classid" of qdisc "parent" or 1093 * to device "dev". 1094 * 1095 * When appropriate send a netlink notification using 'skb' 1096 * and "n". 1097 * 1098 * On success, destroy old qdisc. 1099 */ 1100 1101 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, 1102 struct sk_buff *skb, struct nlmsghdr *n, u32 classid, 1103 struct Qdisc *new, struct Qdisc *old, 1104 struct netlink_ext_ack *extack) 1105 { 1106 struct Qdisc *q = old; 1107 struct net *net = dev_net(dev); 1108 1109 if (parent == NULL) { 1110 unsigned int i, num_q, ingress; 1111 struct netdev_queue *dev_queue; 1112 1113 ingress = 0; 1114 num_q = dev->num_tx_queues; 1115 if ((q && q->flags & TCQ_F_INGRESS) || 1116 (new && new->flags & TCQ_F_INGRESS)) { 1117 ingress = 1; 1118 dev_queue = dev_ingress_queue(dev); 1119 if (!dev_queue) { 1120 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); 1121 return -ENOENT; 1122 } 1123 1124 q = rtnl_dereference(dev_queue->qdisc_sleeping); 1125 1126 /* This is the counterpart of that qdisc_refcount_inc_nz() call in 1127 * __tcf_qdisc_find() for filter requests. 1128 */ 1129 if (!qdisc_refcount_dec_if_one(q)) { 1130 NL_SET_ERR_MSG(extack, 1131 "Current ingress or clsact Qdisc has ongoing filter requests"); 1132 return -EBUSY; 1133 } 1134 } 1135 1136 if (dev->flags & IFF_UP) 1137 dev_deactivate(dev); 1138 1139 qdisc_offload_graft_root(dev, new, old, extack); 1140 1141 if (new && new->ops->attach && !ingress) 1142 goto skip; 1143 1144 if (!ingress) { 1145 for (i = 0; i < num_q; i++) { 1146 dev_queue = netdev_get_tx_queue(dev, i); 1147 old = dev_graft_qdisc(dev_queue, new); 1148 1149 if (new && i > 0) 1150 qdisc_refcount_inc(new); 1151 qdisc_put(old); 1152 } 1153 } else { 1154 old = dev_graft_qdisc(dev_queue, NULL); 1155 1156 /* {ingress,clsact}_destroy() @old before grafting @new to avoid 1157 * unprotected concurrent accesses to net_device::miniq_{in,e}gress 1158 * pointer(s) in mini_qdisc_pair_swap(). 1159 */ 1160 qdisc_notify(net, skb, n, classid, old, new, extack); 1161 qdisc_destroy(old); 1162 1163 dev_graft_qdisc(dev_queue, new); 1164 } 1165 1166 skip: 1167 if (!ingress) { 1168 old = rtnl_dereference(dev->qdisc); 1169 if (new && !new->ops->attach) 1170 qdisc_refcount_inc(new); 1171 rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); 1172 1173 notify_and_destroy(net, skb, n, classid, old, new, extack); 1174 1175 if (new && new->ops->attach) 1176 new->ops->attach(new); 1177 } 1178 1179 if (dev->flags & IFF_UP) 1180 dev_activate(dev); 1181 } else { 1182 const struct Qdisc_class_ops *cops = parent->ops->cl_ops; 1183 unsigned long cl; 1184 int err; 1185 1186 /* Only support running class lockless if parent is lockless */ 1187 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) 1188 qdisc_clear_nolock(new); 1189 1190 if (!cops || !cops->graft) 1191 return -EOPNOTSUPP; 1192 1193 cl = cops->find(parent, classid); 1194 if (!cl) { 1195 NL_SET_ERR_MSG(extack, "Specified class not found"); 1196 return -ENOENT; 1197 } 1198 1199 if (new && new->ops == &noqueue_qdisc_ops) { 1200 NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); 1201 return -EINVAL; 1202 } 1203 1204 if (new && 1205 !(parent->flags & TCQ_F_MQROOT) && 1206 rcu_access_pointer(new->stab)) { 1207 NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); 1208 return -EINVAL; 1209 } 1210 err = cops->graft(parent, cl, new, &old, extack); 1211 if (err) 1212 return err; 1213 notify_and_destroy(net, skb, n, classid, old, new, extack); 1214 } 1215 return 0; 1216 } 1217 1218 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, 1219 struct netlink_ext_ack *extack) 1220 { 1221 u32 block_index; 1222 1223 if (tca[TCA_INGRESS_BLOCK]) { 1224 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); 1225 1226 if (!block_index) { 1227 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); 1228 return -EINVAL; 1229 } 1230 if (!sch->ops->ingress_block_set) { 1231 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); 1232 return -EOPNOTSUPP; 1233 } 1234 sch->ops->ingress_block_set(sch, block_index); 1235 } 1236 if (tca[TCA_EGRESS_BLOCK]) { 1237 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); 1238 1239 if (!block_index) { 1240 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); 1241 return -EINVAL; 1242 } 1243 if (!sch->ops->egress_block_set) { 1244 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); 1245 return -EOPNOTSUPP; 1246 } 1247 sch->ops->egress_block_set(sch, block_index); 1248 } 1249 return 0; 1250 } 1251 1252 /* 1253 Allocate and initialize new qdisc. 1254 1255 Parameters are passed via opt. 1256 */ 1257 1258 static struct Qdisc *qdisc_create(struct net_device *dev, 1259 struct netdev_queue *dev_queue, 1260 u32 parent, u32 handle, 1261 struct nlattr **tca, int *errp, 1262 struct netlink_ext_ack *extack) 1263 { 1264 int err; 1265 struct nlattr *kind = tca[TCA_KIND]; 1266 struct Qdisc *sch; 1267 struct Qdisc_ops *ops; 1268 struct qdisc_size_table *stab; 1269 1270 ops = qdisc_lookup_ops(kind); 1271 #ifdef CONFIG_MODULES 1272 if (ops == NULL && kind != NULL) { 1273 char name[IFNAMSIZ]; 1274 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { 1275 /* We dropped the RTNL semaphore in order to 1276 * perform the module load. So, even if we 1277 * succeeded in loading the module we have to 1278 * tell the caller to replay the request. We 1279 * indicate this using -EAGAIN. 1280 * We replay the request because the device may 1281 * go away in the mean time. 1282 */ 1283 netdev_unlock_ops(dev); 1284 rtnl_unlock(); 1285 request_module(NET_SCH_ALIAS_PREFIX "%s", name); 1286 rtnl_lock(); 1287 netdev_lock_ops(dev); 1288 ops = qdisc_lookup_ops(kind); 1289 if (ops != NULL) { 1290 /* We will try again qdisc_lookup_ops, 1291 * so don't keep a reference. 1292 */ 1293 module_put(ops->owner); 1294 err = -EAGAIN; 1295 goto err_out; 1296 } 1297 } 1298 } 1299 #endif 1300 1301 err = -ENOENT; 1302 if (!ops) { 1303 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); 1304 goto err_out; 1305 } 1306 1307 sch = qdisc_alloc(dev_queue, ops, extack); 1308 if (IS_ERR(sch)) { 1309 err = PTR_ERR(sch); 1310 goto err_out2; 1311 } 1312 1313 sch->parent = parent; 1314 1315 if (handle == TC_H_INGRESS) { 1316 if (!(sch->flags & TCQ_F_INGRESS)) { 1317 NL_SET_ERR_MSG(extack, 1318 "Specified parent ID is reserved for ingress and clsact Qdiscs"); 1319 err = -EINVAL; 1320 goto err_out3; 1321 } 1322 handle = TC_H_MAKE(TC_H_INGRESS, 0); 1323 } else { 1324 if (handle == 0) { 1325 handle = qdisc_alloc_handle(dev); 1326 if (handle == 0) { 1327 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded"); 1328 err = -ENOSPC; 1329 goto err_out3; 1330 } 1331 } 1332 if (!netif_is_multiqueue(dev)) 1333 sch->flags |= TCQ_F_ONETXQUEUE; 1334 } 1335 1336 sch->handle = handle; 1337 1338 /* This exist to keep backward compatible with a userspace 1339 * loophole, what allowed userspace to get IFF_NO_QUEUE 1340 * facility on older kernels by setting tx_queue_len=0 (prior 1341 * to qdisc init), and then forgot to reinit tx_queue_len 1342 * before again attaching a qdisc. 1343 */ 1344 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { 1345 WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); 1346 netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); 1347 } 1348 1349 err = qdisc_block_indexes_set(sch, tca, extack); 1350 if (err) 1351 goto err_out3; 1352 1353 if (tca[TCA_STAB]) { 1354 stab = qdisc_get_stab(tca[TCA_STAB], extack); 1355 if (IS_ERR(stab)) { 1356 err = PTR_ERR(stab); 1357 goto err_out3; 1358 } 1359 rcu_assign_pointer(sch->stab, stab); 1360 } 1361 1362 if (ops->init) { 1363 err = ops->init(sch, tca[TCA_OPTIONS], extack); 1364 if (err != 0) 1365 goto err_out4; 1366 } 1367 1368 if (tca[TCA_RATE]) { 1369 err = -EOPNOTSUPP; 1370 if (sch->flags & TCQ_F_MQROOT) { 1371 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); 1372 goto err_out4; 1373 } 1374 1375 err = gen_new_estimator(&sch->bstats, 1376 sch->cpu_bstats, 1377 &sch->rate_est, 1378 NULL, 1379 true, 1380 tca[TCA_RATE]); 1381 if (err) { 1382 NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); 1383 goto err_out4; 1384 } 1385 } 1386 1387 qdisc_hash_add(sch, false); 1388 trace_qdisc_create(ops, dev, parent); 1389 1390 return sch; 1391 1392 err_out4: 1393 /* Even if ops->init() failed, we call ops->destroy() 1394 * like qdisc_create_dflt(). 1395 */ 1396 if (ops->destroy) 1397 ops->destroy(sch); 1398 qdisc_put_stab(rtnl_dereference(sch->stab)); 1399 err_out3: 1400 lockdep_unregister_key(&sch->root_lock_key); 1401 netdev_put(dev, &sch->dev_tracker); 1402 qdisc_free(sch); 1403 err_out2: 1404 module_put(ops->owner); 1405 err_out: 1406 *errp = err; 1407 return NULL; 1408 } 1409 1410 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, 1411 struct netlink_ext_ack *extack) 1412 { 1413 struct qdisc_size_table *ostab, *stab = NULL; 1414 int err = 0; 1415 1416 if (tca[TCA_OPTIONS]) { 1417 if (!sch->ops->change) { 1418 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); 1419 return -EINVAL; 1420 } 1421 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { 1422 NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); 1423 return -EOPNOTSUPP; 1424 } 1425 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); 1426 if (err) 1427 return err; 1428 } 1429 1430 if (tca[TCA_STAB]) { 1431 stab = qdisc_get_stab(tca[TCA_STAB], extack); 1432 if (IS_ERR(stab)) 1433 return PTR_ERR(stab); 1434 } 1435 1436 ostab = rtnl_dereference(sch->stab); 1437 rcu_assign_pointer(sch->stab, stab); 1438 qdisc_put_stab(ostab); 1439 1440 if (tca[TCA_RATE]) { 1441 /* NB: ignores errors from replace_estimator 1442 because change can't be undone. */ 1443 if (sch->flags & TCQ_F_MQROOT) 1444 goto out; 1445 gen_replace_estimator(&sch->bstats, 1446 sch->cpu_bstats, 1447 &sch->rate_est, 1448 NULL, 1449 true, 1450 tca[TCA_RATE]); 1451 } 1452 out: 1453 return 0; 1454 } 1455 1456 struct check_loop_arg { 1457 struct qdisc_walker w; 1458 struct Qdisc *p; 1459 int depth; 1460 }; 1461 1462 static int check_loop_fn(struct Qdisc *q, unsigned long cl, 1463 struct qdisc_walker *w); 1464 1465 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) 1466 { 1467 struct check_loop_arg arg; 1468 1469 if (q->ops->cl_ops == NULL) 1470 return 0; 1471 1472 arg.w.stop = arg.w.skip = arg.w.count = 0; 1473 arg.w.fn = check_loop_fn; 1474 arg.depth = depth; 1475 arg.p = p; 1476 q->ops->cl_ops->walk(q, &arg.w); 1477 return arg.w.stop ? -ELOOP : 0; 1478 } 1479 1480 static int 1481 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) 1482 { 1483 struct Qdisc *leaf; 1484 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 1485 struct check_loop_arg *arg = (struct check_loop_arg *)w; 1486 1487 leaf = cops->leaf(q, cl); 1488 if (leaf) { 1489 if (leaf == arg->p || arg->depth > 7) 1490 return -ELOOP; 1491 return check_loop(leaf, arg->p, arg->depth + 1); 1492 } 1493 return 0; 1494 } 1495 1496 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { 1497 [TCA_KIND] = { .type = NLA_STRING }, 1498 [TCA_RATE] = { .type = NLA_BINARY, 1499 .len = sizeof(struct tc_estimator) }, 1500 [TCA_STAB] = { .type = NLA_NESTED }, 1501 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, 1502 [TCA_CHAIN] = { .type = NLA_U32 }, 1503 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, 1504 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, 1505 }; 1506 1507 /* 1508 * Delete/get qdisc. 1509 */ 1510 1511 static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1512 struct netlink_ext_ack *extack, 1513 struct net_device *dev, 1514 struct nlattr *tca[TCA_MAX + 1], 1515 struct tcmsg *tcm) 1516 { 1517 struct net *net = sock_net(skb->sk); 1518 struct Qdisc *q = NULL; 1519 struct Qdisc *p = NULL; 1520 u32 clid; 1521 int err; 1522 1523 clid = tcm->tcm_parent; 1524 if (clid) { 1525 if (clid != TC_H_ROOT) { 1526 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { 1527 p = qdisc_lookup(dev, TC_H_MAJ(clid)); 1528 if (!p) { 1529 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); 1530 return -ENOENT; 1531 } 1532 q = qdisc_leaf(p, clid); 1533 } else if (dev_ingress_queue(dev)) { 1534 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); 1535 } 1536 } else { 1537 q = rtnl_dereference(dev->qdisc); 1538 } 1539 if (!q) { 1540 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); 1541 return -ENOENT; 1542 } 1543 1544 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { 1545 NL_SET_ERR_MSG(extack, "Invalid handle"); 1546 return -EINVAL; 1547 } 1548 } else { 1549 q = qdisc_lookup(dev, tcm->tcm_handle); 1550 if (!q) { 1551 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle"); 1552 return -ENOENT; 1553 } 1554 } 1555 1556 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1557 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1558 return -EINVAL; 1559 } 1560 1561 if (n->nlmsg_type == RTM_DELQDISC) { 1562 if (!clid) { 1563 NL_SET_ERR_MSG(extack, "Classid cannot be zero"); 1564 return -EINVAL; 1565 } 1566 if (q->handle == 0) { 1567 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero"); 1568 return -ENOENT; 1569 } 1570 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack); 1571 if (err != 0) 1572 return err; 1573 } else { 1574 qdisc_get_notify(net, skb, n, clid, q, NULL); 1575 } 1576 return 0; 1577 } 1578 1579 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1580 struct netlink_ext_ack *extack) 1581 { 1582 struct net *net = sock_net(skb->sk); 1583 struct tcmsg *tcm = nlmsg_data(n); 1584 struct nlattr *tca[TCA_MAX + 1]; 1585 struct net_device *dev; 1586 int err; 1587 1588 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 1589 rtm_tca_policy, extack); 1590 if (err < 0) 1591 return err; 1592 1593 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 1594 if (!dev) 1595 return -ENODEV; 1596 1597 netdev_lock_ops(dev); 1598 err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm); 1599 netdev_unlock_ops(dev); 1600 1601 return err; 1602 } 1603 1604 static bool req_create_or_replace(struct nlmsghdr *n) 1605 { 1606 return (n->nlmsg_flags & NLM_F_CREATE && 1607 n->nlmsg_flags & NLM_F_REPLACE); 1608 } 1609 1610 static bool req_create_exclusive(struct nlmsghdr *n) 1611 { 1612 return (n->nlmsg_flags & NLM_F_CREATE && 1613 n->nlmsg_flags & NLM_F_EXCL); 1614 } 1615 1616 static bool req_change(struct nlmsghdr *n) 1617 { 1618 return (!(n->nlmsg_flags & NLM_F_CREATE) && 1619 !(n->nlmsg_flags & NLM_F_REPLACE) && 1620 !(n->nlmsg_flags & NLM_F_EXCL)); 1621 } 1622 1623 static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1624 struct netlink_ext_ack *extack, 1625 struct net_device *dev, 1626 struct nlattr *tca[TCA_MAX + 1], 1627 struct tcmsg *tcm, 1628 bool *replay) 1629 { 1630 struct Qdisc *q = NULL; 1631 struct Qdisc *p = NULL; 1632 u32 clid; 1633 int err; 1634 1635 clid = tcm->tcm_parent; 1636 1637 if (clid) { 1638 if (clid != TC_H_ROOT) { 1639 if (clid != TC_H_INGRESS) { 1640 p = qdisc_lookup(dev, TC_H_MAJ(clid)); 1641 if (!p) { 1642 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); 1643 return -ENOENT; 1644 } 1645 q = qdisc_leaf(p, clid); 1646 } else if (dev_ingress_queue_create(dev)) { 1647 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); 1648 } 1649 } else { 1650 q = rtnl_dereference(dev->qdisc); 1651 } 1652 1653 /* It may be default qdisc, ignore it */ 1654 if (q && q->handle == 0) 1655 q = NULL; 1656 1657 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { 1658 if (tcm->tcm_handle) { 1659 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) { 1660 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override"); 1661 return -EEXIST; 1662 } 1663 if (TC_H_MIN(tcm->tcm_handle)) { 1664 NL_SET_ERR_MSG(extack, "Invalid minor handle"); 1665 return -EINVAL; 1666 } 1667 q = qdisc_lookup(dev, tcm->tcm_handle); 1668 if (!q) 1669 goto create_n_graft; 1670 if (q->parent != tcm->tcm_parent) { 1671 NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); 1672 return -EINVAL; 1673 } 1674 if (n->nlmsg_flags & NLM_F_EXCL) { 1675 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); 1676 return -EEXIST; 1677 } 1678 if (tca[TCA_KIND] && 1679 nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1680 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1681 return -EINVAL; 1682 } 1683 if (q->flags & TCQ_F_INGRESS) { 1684 NL_SET_ERR_MSG(extack, 1685 "Cannot regraft ingress or clsact Qdiscs"); 1686 return -EINVAL; 1687 } 1688 if (q == p || 1689 (p && check_loop(q, p, 0))) { 1690 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); 1691 return -ELOOP; 1692 } 1693 if (clid == TC_H_INGRESS) { 1694 NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); 1695 return -EINVAL; 1696 } 1697 qdisc_refcount_inc(q); 1698 goto graft; 1699 } else { 1700 if (!q) 1701 goto create_n_graft; 1702 1703 /* This magic test requires explanation. 1704 * 1705 * We know, that some child q is already 1706 * attached to this parent and have choice: 1707 * 1) change it or 2) create/graft new one. 1708 * If the requested qdisc kind is different 1709 * than the existing one, then we choose graft. 1710 * If they are the same then this is "change" 1711 * operation - just let it fallthrough.. 1712 * 1713 * 1. We are allowed to create/graft only 1714 * if the request is explicitly stating 1715 * "please create if it doesn't exist". 1716 * 1717 * 2. If the request is to exclusive create 1718 * then the qdisc tcm_handle is not expected 1719 * to exist, so that we choose create/graft too. 1720 * 1721 * 3. The last case is when no flags are set. 1722 * This will happen when for example tc 1723 * utility issues a "change" command. 1724 * Alas, it is sort of hole in API, we 1725 * cannot decide what to do unambiguously. 1726 * For now we select create/graft. 1727 */ 1728 if (tca[TCA_KIND] && 1729 nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1730 if (req_create_or_replace(n) || 1731 req_create_exclusive(n)) 1732 goto create_n_graft; 1733 else if (req_change(n)) 1734 goto create_n_graft2; 1735 } 1736 } 1737 } 1738 } else { 1739 if (!tcm->tcm_handle) { 1740 NL_SET_ERR_MSG(extack, "Handle cannot be zero"); 1741 return -EINVAL; 1742 } 1743 q = qdisc_lookup(dev, tcm->tcm_handle); 1744 } 1745 1746 /* Change qdisc parameters */ 1747 if (!q) { 1748 NL_SET_ERR_MSG(extack, "Specified qdisc not found"); 1749 return -ENOENT; 1750 } 1751 if (n->nlmsg_flags & NLM_F_EXCL) { 1752 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); 1753 return -EEXIST; 1754 } 1755 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1756 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1757 return -EINVAL; 1758 } 1759 err = qdisc_change(q, tca, extack); 1760 if (err == 0) 1761 qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack); 1762 return err; 1763 1764 create_n_graft: 1765 if (!(n->nlmsg_flags & NLM_F_CREATE)) { 1766 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); 1767 return -ENOENT; 1768 } 1769 create_n_graft2: 1770 if (clid == TC_H_INGRESS) { 1771 if (dev_ingress_queue(dev)) { 1772 q = qdisc_create(dev, dev_ingress_queue(dev), 1773 tcm->tcm_parent, tcm->tcm_parent, 1774 tca, &err, extack); 1775 } else { 1776 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); 1777 err = -ENOENT; 1778 } 1779 } else { 1780 struct netdev_queue *dev_queue; 1781 1782 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) 1783 dev_queue = p->ops->cl_ops->select_queue(p, tcm); 1784 else if (p) 1785 dev_queue = p->dev_queue; 1786 else 1787 dev_queue = netdev_get_tx_queue(dev, 0); 1788 1789 q = qdisc_create(dev, dev_queue, 1790 tcm->tcm_parent, tcm->tcm_handle, 1791 tca, &err, extack); 1792 } 1793 if (q == NULL) { 1794 if (err == -EAGAIN) { 1795 *replay = true; 1796 return 0; 1797 } 1798 return err; 1799 } 1800 1801 graft: 1802 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); 1803 if (err) { 1804 if (q) 1805 qdisc_put(q); 1806 return err; 1807 } 1808 1809 return 0; 1810 } 1811 1812 /* 1813 * Create/change qdisc. 1814 */ 1815 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1816 struct netlink_ext_ack *extack) 1817 { 1818 struct net *net = sock_net(skb->sk); 1819 struct nlattr *tca[TCA_MAX + 1]; 1820 struct net_device *dev; 1821 struct tcmsg *tcm; 1822 bool replay; 1823 int err; 1824 1825 replay: 1826 /* Reinit, just in case something touches this. */ 1827 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 1828 rtm_tca_policy, extack); 1829 if (err < 0) 1830 return err; 1831 1832 tcm = nlmsg_data(n); 1833 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 1834 if (!dev) 1835 return -ENODEV; 1836 1837 replay = false; 1838 netdev_lock_ops(dev); 1839 err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm, &replay); 1840 netdev_unlock_ops(dev); 1841 if (replay) 1842 goto replay; 1843 1844 return err; 1845 } 1846 1847 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, 1848 struct netlink_callback *cb, 1849 int *q_idx_p, int s_q_idx, bool recur, 1850 bool dump_invisible) 1851 { 1852 int ret = 0, q_idx = *q_idx_p; 1853 struct Qdisc *q; 1854 int b; 1855 1856 if (!root) 1857 return 0; 1858 1859 q = root; 1860 if (q_idx < s_q_idx) { 1861 q_idx++; 1862 } else { 1863 if (!tc_qdisc_dump_ignore(q, dump_invisible) && 1864 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1865 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1866 RTM_NEWQDISC, NULL) <= 0) 1867 goto done; 1868 q_idx++; 1869 } 1870 1871 /* If dumping singletons, there is no qdisc_dev(root) and the singleton 1872 * itself has already been dumped. 1873 * 1874 * If we've already dumped the top-level (ingress) qdisc above and the global 1875 * qdisc hashtable, we don't want to hit it again 1876 */ 1877 if (!qdisc_dev(root) || !recur) 1878 goto out; 1879 1880 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { 1881 if (q_idx < s_q_idx) { 1882 q_idx++; 1883 continue; 1884 } 1885 if (!tc_qdisc_dump_ignore(q, dump_invisible) && 1886 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1887 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1888 RTM_NEWQDISC, NULL) <= 0) 1889 goto done; 1890 q_idx++; 1891 } 1892 1893 out: 1894 *q_idx_p = q_idx; 1895 return ret; 1896 done: 1897 ret = -1; 1898 goto out; 1899 } 1900 1901 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) 1902 { 1903 struct net *net = sock_net(skb->sk); 1904 int idx, q_idx; 1905 int s_idx, s_q_idx; 1906 struct net_device *dev; 1907 const struct nlmsghdr *nlh = cb->nlh; 1908 struct nlattr *tca[TCA_MAX + 1]; 1909 int err; 1910 1911 s_idx = cb->args[0]; 1912 s_q_idx = q_idx = cb->args[1]; 1913 1914 idx = 0; 1915 ASSERT_RTNL(); 1916 1917 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX, 1918 rtm_tca_policy, cb->extack); 1919 if (err < 0) 1920 return err; 1921 1922 for_each_netdev(net, dev) { 1923 struct netdev_queue *dev_queue; 1924 1925 if (idx < s_idx) 1926 goto cont; 1927 if (idx > s_idx) 1928 s_q_idx = 0; 1929 q_idx = 0; 1930 1931 netdev_lock_ops(dev); 1932 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), 1933 skb, cb, &q_idx, s_q_idx, 1934 true, tca[TCA_DUMP_INVISIBLE]) < 0) { 1935 netdev_unlock_ops(dev); 1936 goto done; 1937 } 1938 1939 dev_queue = dev_ingress_queue(dev); 1940 if (dev_queue && 1941 tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), 1942 skb, cb, &q_idx, s_q_idx, false, 1943 tca[TCA_DUMP_INVISIBLE]) < 0) { 1944 netdev_unlock_ops(dev); 1945 goto done; 1946 } 1947 netdev_unlock_ops(dev); 1948 1949 cont: 1950 idx++; 1951 } 1952 1953 done: 1954 cb->args[0] = idx; 1955 cb->args[1] = q_idx; 1956 1957 return skb->len; 1958 } 1959 1960 1961 1962 /************************************************ 1963 * Traffic classes manipulation. * 1964 ************************************************/ 1965 1966 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 1967 unsigned long cl, u32 portid, u32 seq, u16 flags, 1968 int event, struct netlink_ext_ack *extack) 1969 { 1970 struct tcmsg *tcm; 1971 struct nlmsghdr *nlh; 1972 unsigned char *b = skb_tail_pointer(skb); 1973 struct gnet_dump d; 1974 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; 1975 1976 cond_resched(); 1977 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); 1978 if (!nlh) 1979 goto out_nlmsg_trim; 1980 tcm = nlmsg_data(nlh); 1981 tcm->tcm_family = AF_UNSPEC; 1982 tcm->tcm__pad1 = 0; 1983 tcm->tcm__pad2 = 0; 1984 tcm->tcm_ifindex = qdisc_dev(q)->ifindex; 1985 tcm->tcm_parent = q->handle; 1986 tcm->tcm_handle = q->handle; 1987 tcm->tcm_info = 0; 1988 if (nla_put_string(skb, TCA_KIND, q->ops->id)) 1989 goto nla_put_failure; 1990 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) 1991 goto nla_put_failure; 1992 1993 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 1994 NULL, &d, TCA_PAD) < 0) 1995 goto nla_put_failure; 1996 1997 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) 1998 goto nla_put_failure; 1999 2000 if (gnet_stats_finish_copy(&d) < 0) 2001 goto nla_put_failure; 2002 2003 if (extack && extack->_msg && 2004 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) 2005 goto out_nlmsg_trim; 2006 2007 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 2008 2009 return skb->len; 2010 2011 out_nlmsg_trim: 2012 nla_put_failure: 2013 nlmsg_trim(skb, b); 2014 return -1; 2015 } 2016 2017 static int tclass_notify(struct net *net, struct sk_buff *oskb, 2018 struct nlmsghdr *n, struct Qdisc *q, 2019 unsigned long cl, int event, struct netlink_ext_ack *extack) 2020 { 2021 struct sk_buff *skb; 2022 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2023 2024 if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) 2025 return 0; 2026 2027 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2028 if (!skb) 2029 return -ENOBUFS; 2030 2031 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { 2032 kfree_skb(skb); 2033 return -EINVAL; 2034 } 2035 2036 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 2037 n->nlmsg_flags & NLM_F_ECHO); 2038 } 2039 2040 static int tclass_get_notify(struct net *net, struct sk_buff *oskb, 2041 struct nlmsghdr *n, struct Qdisc *q, 2042 unsigned long cl, struct netlink_ext_ack *extack) 2043 { 2044 struct sk_buff *skb; 2045 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2046 2047 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2048 if (!skb) 2049 return -ENOBUFS; 2050 2051 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS, 2052 extack) < 0) { 2053 kfree_skb(skb); 2054 return -EINVAL; 2055 } 2056 2057 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 2058 n->nlmsg_flags & NLM_F_ECHO); 2059 } 2060 2061 static int tclass_del_notify(struct net *net, 2062 const struct Qdisc_class_ops *cops, 2063 struct sk_buff *oskb, struct nlmsghdr *n, 2064 struct Qdisc *q, unsigned long cl, 2065 struct netlink_ext_ack *extack) 2066 { 2067 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2068 struct sk_buff *skb; 2069 int err = 0; 2070 2071 if (!cops->delete) 2072 return -EOPNOTSUPP; 2073 2074 if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { 2075 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2076 if (!skb) 2077 return -ENOBUFS; 2078 2079 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, 2080 RTM_DELTCLASS, extack) < 0) { 2081 kfree_skb(skb); 2082 return -EINVAL; 2083 } 2084 } else { 2085 skb = NULL; 2086 } 2087 2088 err = cops->delete(q, cl, extack); 2089 if (err) { 2090 kfree_skb(skb); 2091 return err; 2092 } 2093 2094 err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, 2095 n->nlmsg_flags & NLM_F_ECHO); 2096 return err; 2097 } 2098 2099 #ifdef CONFIG_NET_CLS 2100 2101 struct tcf_bind_args { 2102 struct tcf_walker w; 2103 unsigned long base; 2104 unsigned long cl; 2105 u32 classid; 2106 }; 2107 2108 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) 2109 { 2110 struct tcf_bind_args *a = (void *)arg; 2111 2112 if (n && tp->ops->bind_class) { 2113 struct Qdisc *q = tcf_block_q(tp->chain->block); 2114 2115 sch_tree_lock(q); 2116 tp->ops->bind_class(n, a->classid, a->cl, q, a->base); 2117 sch_tree_unlock(q); 2118 } 2119 return 0; 2120 } 2121 2122 struct tc_bind_class_args { 2123 struct qdisc_walker w; 2124 unsigned long new_cl; 2125 u32 portid; 2126 u32 clid; 2127 }; 2128 2129 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, 2130 struct qdisc_walker *w) 2131 { 2132 struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; 2133 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 2134 struct tcf_block *block; 2135 struct tcf_chain *chain; 2136 2137 block = cops->tcf_block(q, cl, NULL); 2138 if (!block) 2139 return 0; 2140 for (chain = tcf_get_next_chain(block, NULL); 2141 chain; 2142 chain = tcf_get_next_chain(block, chain)) { 2143 struct tcf_proto *tp; 2144 2145 for (tp = tcf_get_next_proto(chain, NULL); 2146 tp; tp = tcf_get_next_proto(chain, tp)) { 2147 struct tcf_bind_args arg = {}; 2148 2149 arg.w.fn = tcf_node_bind; 2150 arg.classid = a->clid; 2151 arg.base = cl; 2152 arg.cl = a->new_cl; 2153 tp->ops->walk(tp, &arg.w, true); 2154 } 2155 } 2156 2157 return 0; 2158 } 2159 2160 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, 2161 unsigned long new_cl) 2162 { 2163 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 2164 struct tc_bind_class_args args = {}; 2165 2166 if (!cops->tcf_block) 2167 return; 2168 args.portid = portid; 2169 args.clid = clid; 2170 args.new_cl = new_cl; 2171 args.w.fn = tc_bind_class_walker; 2172 q->ops->cl_ops->walk(q, &args.w); 2173 } 2174 2175 #else 2176 2177 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, 2178 unsigned long new_cl) 2179 { 2180 } 2181 2182 #endif 2183 2184 static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, 2185 struct netlink_ext_ack *extack, 2186 struct net_device *dev, 2187 struct nlattr *tca[TCA_MAX + 1], 2188 struct tcmsg *tcm) 2189 { 2190 struct net *net = sock_net(skb->sk); 2191 const struct Qdisc_class_ops *cops; 2192 struct Qdisc *q = NULL; 2193 unsigned long cl = 0; 2194 unsigned long new_cl; 2195 u32 portid; 2196 u32 clid; 2197 u32 qid; 2198 int err; 2199 2200 /* 2201 parent == TC_H_UNSPEC - unspecified parent. 2202 parent == TC_H_ROOT - class is root, which has no parent. 2203 parent == X:0 - parent is root class. 2204 parent == X:Y - parent is a node in hierarchy. 2205 parent == 0:Y - parent is X:Y, where X:0 is qdisc. 2206 2207 handle == 0:0 - generate handle from kernel pool. 2208 handle == 0:Y - class is X:Y, where X:0 is qdisc. 2209 handle == X:Y - clear. 2210 handle == X:0 - root class. 2211 */ 2212 2213 /* Step 1. Determine qdisc handle X:0 */ 2214 2215 portid = tcm->tcm_parent; 2216 clid = tcm->tcm_handle; 2217 qid = TC_H_MAJ(clid); 2218 2219 if (portid != TC_H_ROOT) { 2220 u32 qid1 = TC_H_MAJ(portid); 2221 2222 if (qid && qid1) { 2223 /* If both majors are known, they must be identical. */ 2224 if (qid != qid1) 2225 return -EINVAL; 2226 } else if (qid1) { 2227 qid = qid1; 2228 } else if (qid == 0) 2229 qid = rtnl_dereference(dev->qdisc)->handle; 2230 2231 /* Now qid is genuine qdisc handle consistent 2232 * both with parent and child. 2233 * 2234 * TC_H_MAJ(portid) still may be unspecified, complete it now. 2235 */ 2236 if (portid) 2237 portid = TC_H_MAKE(qid, portid); 2238 } else { 2239 if (qid == 0) 2240 qid = rtnl_dereference(dev->qdisc)->handle; 2241 } 2242 2243 /* OK. Locate qdisc */ 2244 q = qdisc_lookup(dev, qid); 2245 if (!q) 2246 return -ENOENT; 2247 2248 /* An check that it supports classes */ 2249 cops = q->ops->cl_ops; 2250 if (cops == NULL) 2251 return -EINVAL; 2252 2253 /* Now try to get class */ 2254 if (clid == 0) { 2255 if (portid == TC_H_ROOT) 2256 clid = qid; 2257 } else 2258 clid = TC_H_MAKE(qid, clid); 2259 2260 if (clid) 2261 cl = cops->find(q, clid); 2262 2263 if (cl == 0) { 2264 err = -ENOENT; 2265 if (n->nlmsg_type != RTM_NEWTCLASS || 2266 !(n->nlmsg_flags & NLM_F_CREATE)) 2267 goto out; 2268 } else { 2269 switch (n->nlmsg_type) { 2270 case RTM_NEWTCLASS: 2271 err = -EEXIST; 2272 if (n->nlmsg_flags & NLM_F_EXCL) 2273 goto out; 2274 break; 2275 case RTM_DELTCLASS: 2276 err = tclass_del_notify(net, cops, skb, n, q, cl, extack); 2277 /* Unbind the class with flilters with 0 */ 2278 tc_bind_tclass(q, portid, clid, 0); 2279 goto out; 2280 case RTM_GETTCLASS: 2281 err = tclass_get_notify(net, skb, n, q, cl, extack); 2282 goto out; 2283 default: 2284 err = -EINVAL; 2285 goto out; 2286 } 2287 } 2288 2289 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { 2290 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes"); 2291 return -EOPNOTSUPP; 2292 } 2293 2294 /* Prevent creation of traffic classes with classid TC_H_ROOT */ 2295 if (clid == TC_H_ROOT) { 2296 NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT"); 2297 return -EINVAL; 2298 } 2299 2300 new_cl = cl; 2301 err = -EOPNOTSUPP; 2302 if (cops->change) 2303 err = cops->change(q, clid, portid, tca, &new_cl, extack); 2304 if (err == 0) { 2305 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); 2306 /* We just create a new class, need to do reverse binding. */ 2307 if (cl != new_cl) 2308 tc_bind_tclass(q, portid, clid, new_cl); 2309 } 2310 out: 2311 return err; 2312 } 2313 2314 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, 2315 struct netlink_ext_ack *extack) 2316 { 2317 struct net *net = sock_net(skb->sk); 2318 struct tcmsg *tcm = nlmsg_data(n); 2319 struct nlattr *tca[TCA_MAX + 1]; 2320 struct net_device *dev; 2321 int err; 2322 2323 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 2324 rtm_tca_policy, extack); 2325 if (err < 0) 2326 return err; 2327 2328 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 2329 if (!dev) 2330 return -ENODEV; 2331 2332 netdev_lock_ops(dev); 2333 err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm); 2334 netdev_unlock_ops(dev); 2335 2336 return err; 2337 } 2338 2339 struct qdisc_dump_args { 2340 struct qdisc_walker w; 2341 struct sk_buff *skb; 2342 struct netlink_callback *cb; 2343 }; 2344 2345 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, 2346 struct qdisc_walker *arg) 2347 { 2348 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; 2349 2350 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, 2351 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, 2352 RTM_NEWTCLASS, NULL); 2353 } 2354 2355 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, 2356 struct tcmsg *tcm, struct netlink_callback *cb, 2357 int *t_p, int s_t) 2358 { 2359 struct qdisc_dump_args arg; 2360 2361 if (tc_qdisc_dump_ignore(q, false) || 2362 *t_p < s_t || !q->ops->cl_ops || 2363 (tcm->tcm_parent && 2364 TC_H_MAJ(tcm->tcm_parent) != q->handle)) { 2365 (*t_p)++; 2366 return 0; 2367 } 2368 if (*t_p > s_t) 2369 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); 2370 arg.w.fn = qdisc_class_dump; 2371 arg.skb = skb; 2372 arg.cb = cb; 2373 arg.w.stop = 0; 2374 arg.w.skip = cb->args[1]; 2375 arg.w.count = 0; 2376 q->ops->cl_ops->walk(q, &arg.w); 2377 cb->args[1] = arg.w.count; 2378 if (arg.w.stop) 2379 return -1; 2380 (*t_p)++; 2381 return 0; 2382 } 2383 2384 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, 2385 struct tcmsg *tcm, struct netlink_callback *cb, 2386 int *t_p, int s_t, bool recur) 2387 { 2388 struct Qdisc *q; 2389 int b; 2390 2391 if (!root) 2392 return 0; 2393 2394 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) 2395 return -1; 2396 2397 if (!qdisc_dev(root) || !recur) 2398 return 0; 2399 2400 if (tcm->tcm_parent) { 2401 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); 2402 if (q && q != root && 2403 tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) 2404 return -1; 2405 return 0; 2406 } 2407 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { 2408 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) 2409 return -1; 2410 } 2411 2412 return 0; 2413 } 2414 2415 static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb, 2416 struct tcmsg *tcm, struct net_device *dev) 2417 { 2418 struct netdev_queue *dev_queue; 2419 int t, s_t; 2420 2421 s_t = cb->args[0]; 2422 t = 0; 2423 2424 if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), 2425 skb, tcm, cb, &t, s_t, true) < 0) 2426 goto done; 2427 2428 dev_queue = dev_ingress_queue(dev); 2429 if (dev_queue && 2430 tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping), 2431 skb, tcm, cb, &t, s_t, false) < 0) 2432 goto done; 2433 2434 done: 2435 cb->args[0] = t; 2436 2437 return skb->len; 2438 } 2439 2440 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) 2441 { 2442 struct tcmsg *tcm = nlmsg_data(cb->nlh); 2443 struct net *net = sock_net(skb->sk); 2444 struct net_device *dev; 2445 int err; 2446 2447 if (nlmsg_len(cb->nlh) < sizeof(*tcm)) 2448 return 0; 2449 2450 dev = dev_get_by_index(net, tcm->tcm_ifindex); 2451 if (!dev) 2452 return 0; 2453 2454 netdev_lock_ops(dev); 2455 err = __tc_dump_tclass(skb, cb, tcm, dev); 2456 netdev_unlock_ops(dev); 2457 2458 dev_put(dev); 2459 2460 return err; 2461 } 2462 2463 #ifdef CONFIG_PROC_FS 2464 static int psched_show(struct seq_file *seq, void *v) 2465 { 2466 seq_printf(seq, "%08x %08x %08x %08x\n", 2467 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 2468 1000000, 2469 (u32)NSEC_PER_SEC / hrtimer_resolution); 2470 2471 return 0; 2472 } 2473 2474 static int __net_init psched_net_init(struct net *net) 2475 { 2476 struct proc_dir_entry *e; 2477 2478 e = proc_create_single("psched", 0, net->proc_net, psched_show); 2479 if (e == NULL) 2480 return -ENOMEM; 2481 2482 return 0; 2483 } 2484 2485 static void __net_exit psched_net_exit(struct net *net) 2486 { 2487 remove_proc_entry("psched", net->proc_net); 2488 } 2489 #else 2490 static int __net_init psched_net_init(struct net *net) 2491 { 2492 return 0; 2493 } 2494 2495 static void __net_exit psched_net_exit(struct net *net) 2496 { 2497 } 2498 #endif 2499 2500 static struct pernet_operations psched_net_ops = { 2501 .init = psched_net_init, 2502 .exit = psched_net_exit, 2503 }; 2504 2505 #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) 2506 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); 2507 #endif 2508 2509 static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = { 2510 {.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc}, 2511 {.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc}, 2512 {.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc, 2513 .dumpit = tc_dump_qdisc}, 2514 {.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass}, 2515 {.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass}, 2516 {.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass, 2517 .dumpit = tc_dump_tclass}, 2518 }; 2519 2520 static int __init pktsched_init(void) 2521 { 2522 int err; 2523 2524 err = register_pernet_subsys(&psched_net_ops); 2525 if (err) { 2526 pr_err("pktsched_init: " 2527 "cannot initialize per netns operations\n"); 2528 return err; 2529 } 2530 2531 register_qdisc(&pfifo_fast_ops); 2532 register_qdisc(&pfifo_qdisc_ops); 2533 register_qdisc(&bfifo_qdisc_ops); 2534 register_qdisc(&pfifo_head_drop_qdisc_ops); 2535 register_qdisc(&mq_qdisc_ops); 2536 register_qdisc(&noqueue_qdisc_ops); 2537 2538 rtnl_register_many(psched_rtnl_msg_handlers); 2539 2540 tc_wrapper_init(); 2541 2542 return 0; 2543 } 2544 2545 subsys_initcall(pktsched_init); 2546