// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/bpf.h>

#include <net/netdev_lock.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tc_wrapper.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. Queueing disciplines manager frontend.
   2. Traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box which is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something), in an order and at times determined by
   the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform sanity checks and
   the part of the work that is common to all qdiscs, and to provide
   rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing a packet from the queue.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */

/* Protects the list of registered TC modules. It is a pure SMP lock.
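 *
 * For illustration only (a hedged sketch, not part of the original file):
 * a qdisc module provides a struct Qdisc_ops implementing the callbacks
 * described in the overview above and adds it to this list from its module
 * init hook. All "example_*" names below are hypothetical:
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.init		= example_init,
 *		.reset		= qdisc_reset_queue,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);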
*/ 119 static DEFINE_RWLOCK(qdisc_mod_lock); 120 121 122 /************************************************ 123 * Queueing disciplines manipulation. * 124 ************************************************/ 125 126 127 /* The list of all installed queueing disciplines. */ 128 129 static struct Qdisc_ops *qdisc_base; 130 131 /* Register/unregister queueing discipline */ 132 133 int register_qdisc(struct Qdisc_ops *qops) 134 { 135 struct Qdisc_ops *q, **qp; 136 int rc = -EEXIST; 137 138 write_lock(&qdisc_mod_lock); 139 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 140 if (!strcmp(qops->id, q->id)) 141 goto out; 142 143 if (qops->enqueue == NULL) 144 qops->enqueue = noop_qdisc_ops.enqueue; 145 if (qops->peek == NULL) { 146 if (qops->dequeue == NULL) 147 qops->peek = noop_qdisc_ops.peek; 148 else 149 goto out_einval; 150 } 151 if (qops->dequeue == NULL) 152 qops->dequeue = noop_qdisc_ops.dequeue; 153 154 if (qops->cl_ops) { 155 const struct Qdisc_class_ops *cops = qops->cl_ops; 156 157 if (!(cops->find && cops->walk && cops->leaf)) 158 goto out_einval; 159 160 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) 161 goto out_einval; 162 } 163 164 qops->next = NULL; 165 *qp = qops; 166 rc = 0; 167 out: 168 write_unlock(&qdisc_mod_lock); 169 return rc; 170 171 out_einval: 172 rc = -EINVAL; 173 goto out; 174 } 175 EXPORT_SYMBOL(register_qdisc); 176 177 void unregister_qdisc(struct Qdisc_ops *qops) 178 { 179 struct Qdisc_ops *q, **qp; 180 int err = -ENOENT; 181 182 write_lock(&qdisc_mod_lock); 183 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 184 if (q == qops) 185 break; 186 if (q) { 187 *qp = q->next; 188 q->next = NULL; 189 err = 0; 190 } 191 write_unlock(&qdisc_mod_lock); 192 193 WARN(err, "unregister qdisc(%s) failed\n", qops->id); 194 } 195 EXPORT_SYMBOL(unregister_qdisc); 196 197 /* Get default qdisc if not otherwise specified */ 198 void qdisc_get_default(char *name, size_t len) 199 { 200 read_lock(&qdisc_mod_lock); 201 strscpy(name, default_qdisc_ops->id, len); 202 read_unlock(&qdisc_mod_lock); 203 } 204 205 static struct Qdisc_ops *qdisc_lookup_default(const char *name) 206 { 207 struct Qdisc_ops *q = NULL; 208 209 for (q = qdisc_base; q; q = q->next) { 210 if (!strcmp(name, q->id)) { 211 if (!bpf_try_module_get(q, q->owner)) 212 q = NULL; 213 break; 214 } 215 } 216 217 return q; 218 } 219 220 /* Set new default qdisc to use */ 221 int qdisc_set_default(const char *name) 222 { 223 const struct Qdisc_ops *ops; 224 225 if (!capable(CAP_NET_ADMIN)) 226 return -EPERM; 227 228 write_lock(&qdisc_mod_lock); 229 ops = qdisc_lookup_default(name); 230 if (!ops) { 231 /* Not found, drop lock and try to load module */ 232 write_unlock(&qdisc_mod_lock); 233 request_module(NET_SCH_ALIAS_PREFIX "%s", name); 234 write_lock(&qdisc_mod_lock); 235 236 ops = qdisc_lookup_default(name); 237 } 238 239 if (ops) { 240 /* Set new default */ 241 bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); 242 default_qdisc_ops = ops; 243 } 244 write_unlock(&qdisc_mod_lock); 245 246 return ops ? 0 : -ENOENT; 247 } 248 249 #ifdef CONFIG_NET_SCH_DEFAULT 250 /* Set default value from kernel config */ 251 static int __init sch_default_qdisc(void) 252 { 253 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH); 254 } 255 late_initcall(sch_default_qdisc); 256 #endif 257 258 /* We know handle. Find qdisc among all qdisc's attached to device 259 * (root qdisc, all its children, children of children etc.) 
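 * A handle (and hence the hash key used here) is a 32-bit major:minor pair
 * built with TC_H_MAKE(); qdisc handles always have minor 0, so, as an
 * illustrative value, "8001:" in tc syntax corresponds to 0x80010000 here.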
260 * Note: caller either uses rtnl or rcu_read_lock() 261 */ 262 263 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) 264 { 265 struct Qdisc *q; 266 267 if (!qdisc_dev(root)) 268 return (root->handle == handle ? root : NULL); 269 270 if (!(root->flags & TCQ_F_BUILTIN) && 271 root->handle == handle) 272 return root; 273 274 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, 275 lockdep_rtnl_is_held()) { 276 if (q->handle == handle) 277 return q; 278 } 279 return NULL; 280 } 281 282 void qdisc_hash_add(struct Qdisc *q, bool invisible) 283 { 284 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 285 ASSERT_RTNL(); 286 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); 287 if (invisible) 288 q->flags |= TCQ_F_INVISIBLE; 289 } 290 } 291 EXPORT_SYMBOL(qdisc_hash_add); 292 293 void qdisc_hash_del(struct Qdisc *q) 294 { 295 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { 296 ASSERT_RTNL(); 297 hash_del_rcu(&q->hash); 298 } 299 } 300 EXPORT_SYMBOL(qdisc_hash_del); 301 302 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) 303 { 304 struct Qdisc *q; 305 306 if (!handle) 307 return NULL; 308 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); 309 if (q) 310 goto out; 311 312 if (dev_ingress_queue(dev)) 313 q = qdisc_match_from_root( 314 rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping), 315 handle); 316 out: 317 return q; 318 } 319 320 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) 321 { 322 struct netdev_queue *nq; 323 struct Qdisc *q; 324 325 if (!handle) 326 return NULL; 327 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); 328 if (q) 329 goto out; 330 331 nq = dev_ingress_queue_rcu(dev); 332 if (nq) 333 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping), 334 handle); 335 out: 336 return q; 337 } 338 339 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, 340 struct netlink_ext_ack *extack) 341 { 342 unsigned long cl; 343 const struct Qdisc_class_ops *cops = p->ops->cl_ops; 344 345 if (cops == NULL) { 346 NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); 347 return ERR_PTR(-EOPNOTSUPP); 348 } 349 cl = cops->find(p, classid); 350 351 if (cl == 0) { 352 NL_SET_ERR_MSG(extack, "Specified class not found"); 353 return ERR_PTR(-ENOENT); 354 } 355 return cops->leaf(p, cl); 356 } 357 358 /* Find queueing discipline by name */ 359 360 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) 361 { 362 struct Qdisc_ops *q = NULL; 363 364 if (kind) { 365 read_lock(&qdisc_mod_lock); 366 for (q = qdisc_base; q; q = q->next) { 367 if (nla_strcmp(kind, q->id) == 0) { 368 if (!bpf_try_module_get(q, q->owner)) 369 q = NULL; 370 break; 371 } 372 } 373 read_unlock(&qdisc_mod_lock); 374 } 375 return q; 376 } 377 378 /* The linklayer setting were not transferred from iproute2, in older 379 * versions, and the rate tables lookup systems have been dropped in 380 * the kernel. To keep backward compatible with older iproute2 tc 381 * utils, we detect the linklayer setting by detecting if the rate 382 * table were modified. 383 * 384 * For linklayer ATM table entries, the rate table will be aligned to 385 * 48 bytes, thus some table entries will contain the same value. The 386 * mpu (min packet unit) is also encoded into the old rate table, thus 387 * starting from the mpu, we find low and high table entries for 388 * mapping this cell. If these entries contain the same value, when 389 * the rate tables have been modified for linklayer ATM. 
390 * 391 * This is done by rounding mpu to the nearest 48 bytes cell/entry, 392 * and then roundup to the next cell, calc the table entry one below, 393 * and compare. 394 */ 395 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) 396 { 397 int low = roundup(r->mpu, 48); 398 int high = roundup(low+1, 48); 399 int cell_low = low >> r->cell_log; 400 int cell_high = (high >> r->cell_log) - 1; 401 402 /* rtab is too inaccurate at rates > 100Mbit/s */ 403 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { 404 pr_debug("TC linklayer: Giving up ATM detection\n"); 405 return TC_LINKLAYER_ETHERNET; 406 } 407 408 if ((cell_high > cell_low) && (cell_high < 256) 409 && (rtab[cell_low] == rtab[cell_high])) { 410 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", 411 cell_low, cell_high, rtab[cell_high]); 412 return TC_LINKLAYER_ATM; 413 } 414 return TC_LINKLAYER_ETHERNET; 415 } 416 417 static struct qdisc_rate_table *qdisc_rtab_list; 418 419 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, 420 struct nlattr *tab, 421 struct netlink_ext_ack *extack) 422 { 423 struct qdisc_rate_table *rtab; 424 425 if (tab == NULL || r->rate == 0 || 426 r->cell_log == 0 || r->cell_log >= 32 || 427 nla_len(tab) != TC_RTAB_SIZE) { 428 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); 429 return NULL; 430 } 431 432 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { 433 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && 434 !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) { 435 rtab->refcnt++; 436 return rtab; 437 } 438 } 439 440 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); 441 if (rtab) { 442 rtab->rate = *r; 443 rtab->refcnt = 1; 444 memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE); 445 if (r->linklayer == TC_LINKLAYER_UNAWARE) 446 r->linklayer = __detect_linklayer(r, rtab->data); 447 rtab->next = qdisc_rtab_list; 448 qdisc_rtab_list = rtab; 449 } else { 450 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); 451 } 452 return rtab; 453 } 454 EXPORT_SYMBOL(qdisc_get_rtab); 455 456 void qdisc_put_rtab(struct qdisc_rate_table *tab) 457 { 458 struct qdisc_rate_table *rtab, **rtabp; 459 460 if (!tab || --tab->refcnt) 461 return; 462 463 for (rtabp = &qdisc_rtab_list; 464 (rtab = *rtabp) != NULL; 465 rtabp = &rtab->next) { 466 if (rtab == tab) { 467 *rtabp = rtab->next; 468 kfree(rtab); 469 return; 470 } 471 } 472 } 473 EXPORT_SYMBOL(qdisc_put_rtab); 474 475 static LIST_HEAD(qdisc_stab_list); 476 477 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { 478 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, 479 [TCA_STAB_DATA] = { .type = NLA_BINARY }, 480 }; 481 482 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, 483 struct netlink_ext_ack *extack) 484 { 485 struct nlattr *tb[TCA_STAB_MAX + 1]; 486 struct qdisc_size_table *stab; 487 struct tc_sizespec *s; 488 unsigned int tsize = 0; 489 u16 *tab = NULL; 490 int err; 491 492 err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy, 493 extack); 494 if (err < 0) 495 return ERR_PTR(err); 496 if (!tb[TCA_STAB_BASE]) { 497 NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); 498 return ERR_PTR(-EINVAL); 499 } 500 501 s = nla_data(tb[TCA_STAB_BASE]); 502 503 if (s->tsize > 0) { 504 if (!tb[TCA_STAB_DATA]) { 505 NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); 506 return ERR_PTR(-EINVAL); 507 } 508 tab = nla_data(tb[TCA_STAB_DATA]); 509 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); 510 } 511 512 if (tsize != s->tsize || 
(!tab && tsize > 0)) { 513 NL_SET_ERR_MSG(extack, "Invalid size of size table"); 514 return ERR_PTR(-EINVAL); 515 } 516 517 list_for_each_entry(stab, &qdisc_stab_list, list) { 518 if (memcmp(&stab->szopts, s, sizeof(*s))) 519 continue; 520 if (tsize > 0 && 521 memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) 522 continue; 523 stab->refcnt++; 524 return stab; 525 } 526 527 if (s->size_log > STAB_SIZE_LOG_MAX || 528 s->cell_log > STAB_SIZE_LOG_MAX) { 529 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); 530 return ERR_PTR(-EINVAL); 531 } 532 533 stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); 534 if (!stab) 535 return ERR_PTR(-ENOMEM); 536 537 stab->refcnt = 1; 538 stab->szopts = *s; 539 if (tsize > 0) 540 memcpy(stab->data, tab, flex_array_size(stab, data, tsize)); 541 542 list_add_tail(&stab->list, &qdisc_stab_list); 543 544 return stab; 545 } 546 547 void qdisc_put_stab(struct qdisc_size_table *tab) 548 { 549 if (!tab) 550 return; 551 552 if (--tab->refcnt == 0) { 553 list_del(&tab->list); 554 kfree_rcu(tab, rcu); 555 } 556 } 557 EXPORT_SYMBOL(qdisc_put_stab); 558 559 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) 560 { 561 struct nlattr *nest; 562 563 nest = nla_nest_start_noflag(skb, TCA_STAB); 564 if (nest == NULL) 565 goto nla_put_failure; 566 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) 567 goto nla_put_failure; 568 nla_nest_end(skb, nest); 569 570 return skb->len; 571 572 nla_put_failure: 573 return -1; 574 } 575 576 void __qdisc_calculate_pkt_len(struct sk_buff *skb, 577 const struct qdisc_size_table *stab) 578 { 579 int pkt_len, slot; 580 581 pkt_len = skb->len + stab->szopts.overhead; 582 if (unlikely(!stab->szopts.tsize)) 583 goto out; 584 585 slot = pkt_len + stab->szopts.cell_align; 586 if (unlikely(slot < 0)) 587 slot = 0; 588 589 slot >>= stab->szopts.cell_log; 590 if (likely(slot < stab->szopts.tsize)) 591 pkt_len = stab->data[slot]; 592 else 593 pkt_len = stab->data[stab->szopts.tsize - 1] * 594 (slot / stab->szopts.tsize) + 595 stab->data[slot % stab->szopts.tsize]; 596 597 pkt_len <<= stab->szopts.size_log; 598 out: 599 if (unlikely(pkt_len < 1)) 600 pkt_len = 1; 601 qdisc_skb_cb(skb)->pkt_len = pkt_len; 602 } 603 604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) 605 { 606 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, 607 timer); 608 609 rcu_read_lock(); 610 __netif_schedule(qdisc_root(wd->qdisc)); 611 rcu_read_unlock(); 612 613 return HRTIMER_NORESTART; 614 } 615 616 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, 617 clockid_t clockid) 618 { 619 hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); 620 wd->qdisc = qdisc; 621 } 622 EXPORT_SYMBOL(qdisc_watchdog_init_clockid); 623 624 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) 625 { 626 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); 627 } 628 EXPORT_SYMBOL(qdisc_watchdog_init); 629 630 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, 631 u64 delta_ns) 632 { 633 bool deactivated; 634 635 rcu_read_lock(); 636 deactivated = test_bit(__QDISC_STATE_DEACTIVATED, 637 &qdisc_root_sleeping(wd->qdisc)->state); 638 rcu_read_unlock(); 639 if (deactivated) 640 return; 641 642 if (hrtimer_is_queued(&wd->timer)) { 643 u64 softexpires; 644 645 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer)); 646 /* If timer is already set in [expires, expires + delta_ns], 647 * 
do not reprogram it. 648 */ 649 if (softexpires - expires <= delta_ns) 650 return; 651 } 652 653 hrtimer_start_range_ns(&wd->timer, 654 ns_to_ktime(expires), 655 delta_ns, 656 HRTIMER_MODE_ABS_PINNED); 657 } 658 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); 659 660 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) 661 { 662 hrtimer_cancel(&wd->timer); 663 } 664 EXPORT_SYMBOL(qdisc_watchdog_cancel); 665 666 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) 667 { 668 struct hlist_head *h; 669 unsigned int i; 670 671 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL); 672 673 if (h != NULL) { 674 for (i = 0; i < n; i++) 675 INIT_HLIST_HEAD(&h[i]); 676 } 677 return h; 678 } 679 680 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) 681 { 682 struct Qdisc_class_common *cl; 683 struct hlist_node *next; 684 struct hlist_head *nhash, *ohash; 685 unsigned int nsize, nmask, osize; 686 unsigned int i, h; 687 688 /* Rehash when load factor exceeds 0.75 */ 689 if (clhash->hashelems * 4 <= clhash->hashsize * 3) 690 return; 691 nsize = clhash->hashsize * 2; 692 nmask = nsize - 1; 693 nhash = qdisc_class_hash_alloc(nsize); 694 if (nhash == NULL) 695 return; 696 697 ohash = clhash->hash; 698 osize = clhash->hashsize; 699 700 sch_tree_lock(sch); 701 for (i = 0; i < osize; i++) { 702 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { 703 h = qdisc_class_hash(cl->classid, nmask); 704 hlist_add_head(&cl->hnode, &nhash[h]); 705 } 706 } 707 clhash->hash = nhash; 708 clhash->hashsize = nsize; 709 clhash->hashmask = nmask; 710 sch_tree_unlock(sch); 711 712 kvfree(ohash); 713 } 714 EXPORT_SYMBOL(qdisc_class_hash_grow); 715 716 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) 717 { 718 unsigned int size = 4; 719 720 clhash->hash = qdisc_class_hash_alloc(size); 721 if (!clhash->hash) 722 return -ENOMEM; 723 clhash->hashsize = size; 724 clhash->hashmask = size - 1; 725 clhash->hashelems = 0; 726 return 0; 727 } 728 EXPORT_SYMBOL(qdisc_class_hash_init); 729 730 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) 731 { 732 kvfree(clhash->hash); 733 } 734 EXPORT_SYMBOL(qdisc_class_hash_destroy); 735 736 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, 737 struct Qdisc_class_common *cl) 738 { 739 unsigned int h; 740 741 INIT_HLIST_NODE(&cl->hnode); 742 h = qdisc_class_hash(cl->classid, clhash->hashmask); 743 hlist_add_head(&cl->hnode, &clhash->hash[h]); 744 clhash->hashelems++; 745 } 746 EXPORT_SYMBOL(qdisc_class_hash_insert); 747 748 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, 749 struct Qdisc_class_common *cl) 750 { 751 hlist_del(&cl->hnode); 752 clhash->hashelems--; 753 } 754 EXPORT_SYMBOL(qdisc_class_hash_remove); 755 756 /* Allocate an unique handle from space managed by kernel 757 * Possible range is [8000-FFFF]:0000 (0x8000 values) 758 */ 759 static u32 qdisc_alloc_handle(struct net_device *dev) 760 { 761 int i = 0x8000; 762 static u32 autohandle = TC_H_MAKE(0x80000000U, 0); 763 764 do { 765 autohandle += TC_H_MAKE(0x10000U, 0); 766 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) 767 autohandle = TC_H_MAKE(0x80000000U, 0); 768 if (!qdisc_lookup(dev, autohandle)) 769 return autohandle; 770 cond_resched(); 771 } while (--i > 0); 772 773 return 0; 774 } 775 776 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) 777 { 778 const struct Qdisc_class_ops *cops; 779 unsigned long cl; 780 u32 parentid; 781 bool notify; 782 int drops; 783 784 drops = max_t(int, n, 0); 785 rcu_read_lock(); 786 while 
((parentid = sch->parent)) { 787 if (parentid == TC_H_ROOT) 788 break; 789 790 if (sch->flags & TCQ_F_NOPARENT) 791 break; 792 /* Notify parent qdisc only if child qdisc becomes empty. */ 793 notify = !sch->q.qlen; 794 /* TODO: perform the search on a per txq basis */ 795 sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); 796 if (sch == NULL) { 797 WARN_ON_ONCE(parentid != TC_H_ROOT); 798 break; 799 } 800 cops = sch->ops->cl_ops; 801 if (notify && cops->qlen_notify) { 802 /* Note that qlen_notify must be idempotent as it may get called 803 * multiple times. 804 */ 805 cl = cops->find(sch, parentid); 806 cops->qlen_notify(sch, cl); 807 } 808 sch->q.qlen -= n; 809 sch->qstats.backlog -= len; 810 __qdisc_qstats_drop(sch, drops); 811 } 812 rcu_read_unlock(); 813 } 814 EXPORT_SYMBOL(qdisc_tree_reduce_backlog); 815 816 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, 817 void *type_data) 818 { 819 struct net_device *dev = qdisc_dev(sch); 820 int err; 821 822 sch->flags &= ~TCQ_F_OFFLOADED; 823 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 824 return 0; 825 826 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); 827 if (err == -EOPNOTSUPP) 828 return 0; 829 830 if (!err) 831 sch->flags |= TCQ_F_OFFLOADED; 832 833 return err; 834 } 835 EXPORT_SYMBOL(qdisc_offload_dump_helper); 836 837 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, 838 struct Qdisc *new, struct Qdisc *old, 839 enum tc_setup_type type, void *type_data, 840 struct netlink_ext_ack *extack) 841 { 842 bool any_qdisc_is_offloaded; 843 int err; 844 845 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) 846 return; 847 848 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); 849 850 /* Don't report error if the graft is part of destroy operation. */ 851 if (!err || !new || new == &noop_qdisc) 852 return; 853 854 /* Don't report error if the parent, the old child and the new 855 * one are not offloaded. 856 */ 857 any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; 858 any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; 859 any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; 860 861 if (any_qdisc_is_offloaded) 862 NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); 863 } 864 EXPORT_SYMBOL(qdisc_offload_graft_helper); 865 866 void qdisc_offload_query_caps(struct net_device *dev, 867 enum tc_setup_type type, 868 void *caps, size_t caps_len) 869 { 870 const struct net_device_ops *ops = dev->netdev_ops; 871 struct tc_query_caps_base base = { 872 .type = type, 873 .caps = caps, 874 }; 875 876 memset(caps, 0, caps_len); 877 878 if (ops->ndo_setup_tc) 879 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base); 880 } 881 EXPORT_SYMBOL(qdisc_offload_query_caps); 882 883 static void qdisc_offload_graft_root(struct net_device *dev, 884 struct Qdisc *new, struct Qdisc *old, 885 struct netlink_ext_ack *extack) 886 { 887 struct tc_root_qopt_offload graft_offload = { 888 .command = TC_ROOT_GRAFT, 889 .handle = new ? 
new->handle : 0, 890 .ingress = (new && new->flags & TCQ_F_INGRESS) || 891 (old && old->flags & TCQ_F_INGRESS), 892 }; 893 894 qdisc_offload_graft_helper(dev, NULL, new, old, 895 TC_SETUP_ROOT_QDISC, &graft_offload, extack); 896 } 897 898 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 899 u32 portid, u32 seq, u16 flags, int event, 900 struct netlink_ext_ack *extack) 901 { 902 struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; 903 struct gnet_stats_queue __percpu *cpu_qstats = NULL; 904 struct tcmsg *tcm; 905 struct nlmsghdr *nlh; 906 unsigned char *b = skb_tail_pointer(skb); 907 struct gnet_dump d; 908 struct qdisc_size_table *stab; 909 u32 block_index; 910 __u32 qlen; 911 912 cond_resched(); 913 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); 914 if (!nlh) 915 goto out_nlmsg_trim; 916 tcm = nlmsg_data(nlh); 917 tcm->tcm_family = AF_UNSPEC; 918 tcm->tcm__pad1 = 0; 919 tcm->tcm__pad2 = 0; 920 tcm->tcm_ifindex = qdisc_dev(q)->ifindex; 921 tcm->tcm_parent = clid; 922 tcm->tcm_handle = q->handle; 923 tcm->tcm_info = refcount_read(&q->refcnt); 924 if (nla_put_string(skb, TCA_KIND, q->ops->id)) 925 goto nla_put_failure; 926 if (q->ops->ingress_block_get) { 927 block_index = q->ops->ingress_block_get(q); 928 if (block_index && 929 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) 930 goto nla_put_failure; 931 } 932 if (q->ops->egress_block_get) { 933 block_index = q->ops->egress_block_get(q); 934 if (block_index && 935 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) 936 goto nla_put_failure; 937 } 938 if (q->ops->dump && q->ops->dump(q, skb) < 0) 939 goto nla_put_failure; 940 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) 941 goto nla_put_failure; 942 qlen = qdisc_qlen_sum(q); 943 944 stab = rtnl_dereference(q->stab); 945 if (stab && qdisc_dump_stab(skb, stab) < 0) 946 goto nla_put_failure; 947 948 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 949 NULL, &d, TCA_PAD) < 0) 950 goto nla_put_failure; 951 952 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) 953 goto nla_put_failure; 954 955 if (qdisc_is_percpu_stats(q)) { 956 cpu_bstats = q->cpu_bstats; 957 cpu_qstats = q->cpu_qstats; 958 } 959 960 if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || 961 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || 962 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) 963 goto nla_put_failure; 964 965 if (gnet_stats_finish_copy(&d) < 0) 966 goto nla_put_failure; 967 968 if (extack && extack->_msg && 969 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) 970 goto out_nlmsg_trim; 971 972 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 973 974 return skb->len; 975 976 out_nlmsg_trim: 977 nla_put_failure: 978 nlmsg_trim(skb, b); 979 return -1; 980 } 981 982 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) 983 { 984 if (q->flags & TCQ_F_BUILTIN) 985 return true; 986 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) 987 return true; 988 989 return false; 990 } 991 992 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, 993 struct nlmsghdr *n, u32 clid, struct Qdisc *q, 994 struct netlink_ext_ack *extack) 995 { 996 struct sk_buff *skb; 997 u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; 998 999 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1000 if (!skb) 1001 return -ENOBUFS; 1002 1003 if (!tc_qdisc_dump_ignore(q, false)) { 1004 if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, 1005 RTM_NEWQDISC, extack) < 0) 1006 goto err_out; 1007 } 1008 1009 if (skb->len) 1010 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 1011 n->nlmsg_flags & NLM_F_ECHO); 1012 1013 err_out: 1014 kfree_skb(skb); 1015 return -EINVAL; 1016 } 1017 1018 static int qdisc_notify(struct net *net, struct sk_buff *oskb, 1019 struct nlmsghdr *n, u32 clid, 1020 struct Qdisc *old, struct Qdisc *new, 1021 struct netlink_ext_ack *extack) 1022 { 1023 struct sk_buff *skb; 1024 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 1025 1026 if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) 1027 return 0; 1028 1029 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1030 if (!skb) 1031 return -ENOBUFS; 1032 1033 if (old && !tc_qdisc_dump_ignore(old, false)) { 1034 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 1035 0, RTM_DELQDISC, extack) < 0) 1036 goto err_out; 1037 } 1038 if (new && !tc_qdisc_dump_ignore(new, false)) { 1039 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, 1040 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) 1041 goto err_out; 1042 } 1043 1044 if (skb->len) 1045 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 1046 n->nlmsg_flags & NLM_F_ECHO); 1047 1048 err_out: 1049 kfree_skb(skb); 1050 return -EINVAL; 1051 } 1052 1053 static void notify_and_destroy(struct net *net, struct sk_buff *skb, 1054 struct nlmsghdr *n, u32 clid, 1055 struct Qdisc *old, struct Qdisc *new, 1056 struct netlink_ext_ack *extack) 1057 { 1058 if (new || old) 1059 qdisc_notify(net, skb, n, clid, old, new, extack); 1060 1061 if (old) 1062 qdisc_put(old); 1063 } 1064 1065 static void qdisc_clear_nolock(struct Qdisc *sch) 1066 { 1067 sch->flags &= ~TCQ_F_NOLOCK; 1068 if (!(sch->flags & TCQ_F_CPUSTATS)) 1069 return; 1070 1071 free_percpu(sch->cpu_bstats); 1072 free_percpu(sch->cpu_qstats); 1073 sch->cpu_bstats = NULL; 1074 sch->cpu_qstats = NULL; 1075 sch->flags &= ~TCQ_F_CPUSTATS; 1076 } 1077 1078 /* Graft qdisc "new" to class "classid" of qdisc "parent" or 1079 * to device "dev". 1080 * 1081 * When appropriate send a netlink notification using 'skb' 1082 * and "n". 1083 * 1084 * On success, destroy old qdisc. 1085 */ 1086 1087 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, 1088 struct sk_buff *skb, struct nlmsghdr *n, u32 classid, 1089 struct Qdisc *new, struct Qdisc *old, 1090 struct netlink_ext_ack *extack) 1091 { 1092 struct Qdisc *q = old; 1093 struct net *net = dev_net(dev); 1094 1095 if (parent == NULL) { 1096 unsigned int i, num_q, ingress; 1097 struct netdev_queue *dev_queue; 1098 1099 ingress = 0; 1100 num_q = dev->num_tx_queues; 1101 if ((q && q->flags & TCQ_F_INGRESS) || 1102 (new && new->flags & TCQ_F_INGRESS)) { 1103 ingress = 1; 1104 dev_queue = dev_ingress_queue(dev); 1105 if (!dev_queue) { 1106 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); 1107 return -ENOENT; 1108 } 1109 1110 q = rtnl_dereference(dev_queue->qdisc_sleeping); 1111 1112 /* This is the counterpart of that qdisc_refcount_inc_nz() call in 1113 * __tcf_qdisc_find() for filter requests. 
1114 */ 1115 if (!qdisc_refcount_dec_if_one(q)) { 1116 NL_SET_ERR_MSG(extack, 1117 "Current ingress or clsact Qdisc has ongoing filter requests"); 1118 return -EBUSY; 1119 } 1120 } 1121 1122 if (dev->flags & IFF_UP) 1123 dev_deactivate(dev); 1124 1125 qdisc_offload_graft_root(dev, new, old, extack); 1126 1127 if (new && new->ops->attach && !ingress) 1128 goto skip; 1129 1130 if (!ingress) { 1131 for (i = 0; i < num_q; i++) { 1132 dev_queue = netdev_get_tx_queue(dev, i); 1133 old = dev_graft_qdisc(dev_queue, new); 1134 1135 if (new && i > 0) 1136 qdisc_refcount_inc(new); 1137 qdisc_put(old); 1138 } 1139 } else { 1140 old = dev_graft_qdisc(dev_queue, NULL); 1141 1142 /* {ingress,clsact}_destroy() @old before grafting @new to avoid 1143 * unprotected concurrent accesses to net_device::miniq_{in,e}gress 1144 * pointer(s) in mini_qdisc_pair_swap(). 1145 */ 1146 qdisc_notify(net, skb, n, classid, old, new, extack); 1147 qdisc_destroy(old); 1148 1149 dev_graft_qdisc(dev_queue, new); 1150 } 1151 1152 skip: 1153 if (!ingress) { 1154 old = rtnl_dereference(dev->qdisc); 1155 if (new && !new->ops->attach) 1156 qdisc_refcount_inc(new); 1157 rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); 1158 1159 notify_and_destroy(net, skb, n, classid, old, new, extack); 1160 1161 if (new && new->ops->attach) 1162 new->ops->attach(new); 1163 } 1164 1165 if (dev->flags & IFF_UP) 1166 dev_activate(dev); 1167 } else { 1168 const struct Qdisc_class_ops *cops = parent->ops->cl_ops; 1169 unsigned long cl; 1170 int err; 1171 1172 /* Only support running class lockless if parent is lockless */ 1173 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) 1174 qdisc_clear_nolock(new); 1175 1176 if (!cops || !cops->graft) 1177 return -EOPNOTSUPP; 1178 1179 cl = cops->find(parent, classid); 1180 if (!cl) { 1181 NL_SET_ERR_MSG(extack, "Specified class not found"); 1182 return -ENOENT; 1183 } 1184 1185 if (new && new->ops == &noqueue_qdisc_ops) { 1186 NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); 1187 return -EINVAL; 1188 } 1189 1190 if (new && 1191 !(parent->flags & TCQ_F_MQROOT) && 1192 rcu_access_pointer(new->stab)) { 1193 NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); 1194 return -EINVAL; 1195 } 1196 err = cops->graft(parent, cl, new, &old, extack); 1197 if (err) 1198 return err; 1199 notify_and_destroy(net, skb, n, classid, old, new, extack); 1200 } 1201 return 0; 1202 } 1203 1204 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, 1205 struct netlink_ext_ack *extack) 1206 { 1207 u32 block_index; 1208 1209 if (tca[TCA_INGRESS_BLOCK]) { 1210 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); 1211 1212 if (!block_index) { 1213 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); 1214 return -EINVAL; 1215 } 1216 if (!sch->ops->ingress_block_set) { 1217 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); 1218 return -EOPNOTSUPP; 1219 } 1220 sch->ops->ingress_block_set(sch, block_index); 1221 } 1222 if (tca[TCA_EGRESS_BLOCK]) { 1223 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); 1224 1225 if (!block_index) { 1226 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); 1227 return -EINVAL; 1228 } 1229 if (!sch->ops->egress_block_set) { 1230 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); 1231 return -EOPNOTSUPP; 1232 } 1233 sch->ops->egress_block_set(sch, block_index); 1234 } 1235 return 0; 1236 } 1237 1238 /* 1239 Allocate and initialize new qdisc. 1240 1241 Parameters are passed via opt. 
1242 */ 1243 1244 static struct Qdisc *qdisc_create(struct net_device *dev, 1245 struct netdev_queue *dev_queue, 1246 u32 parent, u32 handle, 1247 struct nlattr **tca, int *errp, 1248 struct netlink_ext_ack *extack) 1249 { 1250 int err; 1251 struct nlattr *kind = tca[TCA_KIND]; 1252 struct Qdisc *sch; 1253 struct Qdisc_ops *ops; 1254 struct qdisc_size_table *stab; 1255 1256 ops = qdisc_lookup_ops(kind); 1257 if (!ops) { 1258 err = -ENOENT; 1259 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); 1260 goto err_out; 1261 } 1262 1263 sch = qdisc_alloc(dev_queue, ops, extack); 1264 if (IS_ERR(sch)) { 1265 err = PTR_ERR(sch); 1266 goto err_out2; 1267 } 1268 1269 sch->parent = parent; 1270 1271 if (handle == TC_H_INGRESS) { 1272 if (!(sch->flags & TCQ_F_INGRESS)) { 1273 NL_SET_ERR_MSG(extack, 1274 "Specified parent ID is reserved for ingress and clsact Qdiscs"); 1275 err = -EINVAL; 1276 goto err_out3; 1277 } 1278 handle = TC_H_MAKE(TC_H_INGRESS, 0); 1279 } else { 1280 if (handle == 0) { 1281 handle = qdisc_alloc_handle(dev); 1282 if (handle == 0) { 1283 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded"); 1284 err = -ENOSPC; 1285 goto err_out3; 1286 } 1287 } 1288 if (!netif_is_multiqueue(dev)) 1289 sch->flags |= TCQ_F_ONETXQUEUE; 1290 } 1291 1292 sch->handle = handle; 1293 1294 /* This exist to keep backward compatible with a userspace 1295 * loophole, what allowed userspace to get IFF_NO_QUEUE 1296 * facility on older kernels by setting tx_queue_len=0 (prior 1297 * to qdisc init), and then forgot to reinit tx_queue_len 1298 * before again attaching a qdisc. 1299 */ 1300 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { 1301 WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); 1302 netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); 1303 } 1304 1305 err = qdisc_block_indexes_set(sch, tca, extack); 1306 if (err) 1307 goto err_out3; 1308 1309 if (tca[TCA_STAB]) { 1310 stab = qdisc_get_stab(tca[TCA_STAB], extack); 1311 if (IS_ERR(stab)) { 1312 err = PTR_ERR(stab); 1313 goto err_out3; 1314 } 1315 rcu_assign_pointer(sch->stab, stab); 1316 } 1317 1318 if (ops->init) { 1319 err = ops->init(sch, tca[TCA_OPTIONS], extack); 1320 if (err != 0) 1321 goto err_out4; 1322 } 1323 1324 if (tca[TCA_RATE]) { 1325 err = -EOPNOTSUPP; 1326 if (sch->flags & TCQ_F_MQROOT) { 1327 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); 1328 goto err_out4; 1329 } 1330 1331 err = gen_new_estimator(&sch->bstats, 1332 sch->cpu_bstats, 1333 &sch->rate_est, 1334 NULL, 1335 true, 1336 tca[TCA_RATE]); 1337 if (err) { 1338 NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); 1339 goto err_out4; 1340 } 1341 } 1342 1343 qdisc_hash_add(sch, false); 1344 trace_qdisc_create(ops, dev, parent); 1345 1346 return sch; 1347 1348 err_out4: 1349 /* Even if ops->init() failed, we call ops->destroy() 1350 * like qdisc_create_dflt(). 
1351 */ 1352 if (ops->destroy) 1353 ops->destroy(sch); 1354 qdisc_put_stab(rtnl_dereference(sch->stab)); 1355 err_out3: 1356 lockdep_unregister_key(&sch->root_lock_key); 1357 netdev_put(dev, &sch->dev_tracker); 1358 qdisc_free(sch); 1359 err_out2: 1360 bpf_module_put(ops, ops->owner); 1361 err_out: 1362 *errp = err; 1363 return NULL; 1364 } 1365 1366 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, 1367 struct netlink_ext_ack *extack) 1368 { 1369 struct qdisc_size_table *ostab, *stab = NULL; 1370 int err = 0; 1371 1372 if (tca[TCA_OPTIONS]) { 1373 if (!sch->ops->change) { 1374 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); 1375 return -EINVAL; 1376 } 1377 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { 1378 NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); 1379 return -EOPNOTSUPP; 1380 } 1381 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); 1382 if (err) 1383 return err; 1384 } 1385 1386 if (tca[TCA_STAB]) { 1387 stab = qdisc_get_stab(tca[TCA_STAB], extack); 1388 if (IS_ERR(stab)) 1389 return PTR_ERR(stab); 1390 } 1391 1392 ostab = rtnl_dereference(sch->stab); 1393 rcu_assign_pointer(sch->stab, stab); 1394 qdisc_put_stab(ostab); 1395 1396 if (tca[TCA_RATE]) { 1397 /* NB: ignores errors from replace_estimator 1398 because change can't be undone. */ 1399 if (sch->flags & TCQ_F_MQROOT) 1400 goto out; 1401 gen_replace_estimator(&sch->bstats, 1402 sch->cpu_bstats, 1403 &sch->rate_est, 1404 NULL, 1405 true, 1406 tca[TCA_RATE]); 1407 } 1408 out: 1409 return 0; 1410 } 1411 1412 struct check_loop_arg { 1413 struct qdisc_walker w; 1414 struct Qdisc *p; 1415 int depth; 1416 }; 1417 1418 static int check_loop_fn(struct Qdisc *q, unsigned long cl, 1419 struct qdisc_walker *w); 1420 1421 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) 1422 { 1423 struct check_loop_arg arg; 1424 1425 if (q->ops->cl_ops == NULL) 1426 return 0; 1427 1428 arg.w.stop = arg.w.skip = arg.w.count = 0; 1429 arg.w.fn = check_loop_fn; 1430 arg.depth = depth; 1431 arg.p = p; 1432 q->ops->cl_ops->walk(q, &arg.w); 1433 return arg.w.stop ? -ELOOP : 0; 1434 } 1435 1436 static int 1437 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) 1438 { 1439 struct Qdisc *leaf; 1440 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 1441 struct check_loop_arg *arg = (struct check_loop_arg *)w; 1442 1443 leaf = cops->leaf(q, cl); 1444 if (leaf) { 1445 if (leaf == arg->p || arg->depth > 7) 1446 return -ELOOP; 1447 return check_loop(leaf, arg->p, arg->depth + 1); 1448 } 1449 return 0; 1450 } 1451 1452 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { 1453 [TCA_KIND] = { .type = NLA_STRING }, 1454 [TCA_RATE] = { .type = NLA_BINARY, 1455 .len = sizeof(struct tc_estimator) }, 1456 [TCA_STAB] = { .type = NLA_NESTED }, 1457 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, 1458 [TCA_CHAIN] = { .type = NLA_U32 }, 1459 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, 1460 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, 1461 }; 1462 1463 /* 1464 * Delete/get qdisc. 
1465 */ 1466 1467 static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1468 struct netlink_ext_ack *extack, 1469 struct net_device *dev, 1470 struct nlattr *tca[TCA_MAX + 1], 1471 struct tcmsg *tcm) 1472 { 1473 struct net *net = sock_net(skb->sk); 1474 struct Qdisc *q = NULL; 1475 struct Qdisc *p = NULL; 1476 u32 clid; 1477 int err; 1478 1479 clid = tcm->tcm_parent; 1480 if (clid) { 1481 if (clid != TC_H_ROOT) { 1482 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { 1483 p = qdisc_lookup(dev, TC_H_MAJ(clid)); 1484 if (!p) { 1485 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); 1486 return -ENOENT; 1487 } 1488 q = qdisc_leaf(p, clid, extack); 1489 } else if (dev_ingress_queue(dev)) { 1490 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); 1491 } 1492 } else { 1493 q = rtnl_dereference(dev->qdisc); 1494 } 1495 if (!q) { 1496 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); 1497 return -ENOENT; 1498 } 1499 if (IS_ERR(q)) 1500 return PTR_ERR(q); 1501 1502 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { 1503 NL_SET_ERR_MSG(extack, "Invalid handle"); 1504 return -EINVAL; 1505 } 1506 } else { 1507 q = qdisc_lookup(dev, tcm->tcm_handle); 1508 if (!q) { 1509 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle"); 1510 return -ENOENT; 1511 } 1512 } 1513 1514 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1515 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1516 return -EINVAL; 1517 } 1518 1519 if (n->nlmsg_type == RTM_DELQDISC) { 1520 if (!clid) { 1521 NL_SET_ERR_MSG(extack, "Classid cannot be zero"); 1522 return -EINVAL; 1523 } 1524 if (q->handle == 0) { 1525 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero"); 1526 return -ENOENT; 1527 } 1528 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack); 1529 if (err != 0) 1530 return err; 1531 } else { 1532 qdisc_get_notify(net, skb, n, clid, q, NULL); 1533 } 1534 return 0; 1535 } 1536 1537 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1538 struct netlink_ext_ack *extack) 1539 { 1540 struct net *net = sock_net(skb->sk); 1541 struct tcmsg *tcm = nlmsg_data(n); 1542 struct nlattr *tca[TCA_MAX + 1]; 1543 struct net_device *dev; 1544 int err; 1545 1546 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 1547 rtm_tca_policy, extack); 1548 if (err < 0) 1549 return err; 1550 1551 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 1552 if (!dev) 1553 return -ENODEV; 1554 1555 netdev_lock_ops(dev); 1556 err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm); 1557 netdev_unlock_ops(dev); 1558 1559 return err; 1560 } 1561 1562 static bool req_create_or_replace(struct nlmsghdr *n) 1563 { 1564 return (n->nlmsg_flags & NLM_F_CREATE && 1565 n->nlmsg_flags & NLM_F_REPLACE); 1566 } 1567 1568 static bool req_create_exclusive(struct nlmsghdr *n) 1569 { 1570 return (n->nlmsg_flags & NLM_F_CREATE && 1571 n->nlmsg_flags & NLM_F_EXCL); 1572 } 1573 1574 static bool req_change(struct nlmsghdr *n) 1575 { 1576 return (!(n->nlmsg_flags & NLM_F_CREATE) && 1577 !(n->nlmsg_flags & NLM_F_REPLACE) && 1578 !(n->nlmsg_flags & NLM_F_EXCL)); 1579 } 1580 1581 static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1582 struct netlink_ext_ack *extack, 1583 struct net_device *dev, 1584 struct nlattr *tca[TCA_MAX + 1], 1585 struct tcmsg *tcm) 1586 { 1587 struct Qdisc *q = NULL; 1588 struct Qdisc *p = NULL; 1589 u32 clid; 1590 int err; 1591 1592 clid = tcm->tcm_parent; 1593 1594 if (clid) { 1595 if (clid != 
TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				if (p->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot add children to ingress/clsact qdisc");
					return -EOPNOTSUPP;
				}
				q = qdisc_leaf(p, clid, extack);
				if (IS_ERR(q))
					return PTR_ERR(q);
			} else if (dev_ingress_queue_create(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be the default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (q->parent != tcm->tcm_parent) {
					NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent");
					return -EINVAL;
				}
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child qdisc is already
				 * attached to this parent and we have a
				 * choice: 1) change it, or 2) create and
				 * graft a new one.
				 * If the requested qdisc kind differs from
				 * the existing one, we choose graft.
				 * If they are the same, then this is a
				 * "change" operation - just let it fall
				 * through.
				 *
				 * 1. We are allowed to create/graft only
				 * if the request explicitly states
				 * "please create it if it doesn't exist".
				 *
				 * 2. If the request is an exclusive create,
				 * then the qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * This will happen when, for example, the tc
				 * utility issues a "change" command.
				 * Alas, this is a sort of hole in the API; we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft.
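				 *
				 * For example (illustrative only): "tc qdisc
				 * change dev eth0 root sfq" on a device whose
				 * root qdisc is pfifo_fast arrives with none
				 * of NLM_F_CREATE, NLM_F_REPLACE or NLM_F_EXCL
				 * set; since the kinds differ, we take the
				 * create/graft path rather than failing the
				 * change.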
1691 */ 1692 if (tca[TCA_KIND] && 1693 nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1694 if (req_create_or_replace(n) || 1695 req_create_exclusive(n)) 1696 goto create_n_graft; 1697 else if (req_change(n)) 1698 goto create_n_graft2; 1699 } 1700 } 1701 } 1702 } else { 1703 if (!tcm->tcm_handle) { 1704 NL_SET_ERR_MSG(extack, "Handle cannot be zero"); 1705 return -EINVAL; 1706 } 1707 q = qdisc_lookup(dev, tcm->tcm_handle); 1708 } 1709 1710 /* Change qdisc parameters */ 1711 if (!q) { 1712 NL_SET_ERR_MSG(extack, "Specified qdisc not found"); 1713 return -ENOENT; 1714 } 1715 if (n->nlmsg_flags & NLM_F_EXCL) { 1716 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); 1717 return -EEXIST; 1718 } 1719 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1720 NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); 1721 return -EINVAL; 1722 } 1723 err = qdisc_change(q, tca, extack); 1724 if (err == 0) 1725 qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack); 1726 return err; 1727 1728 create_n_graft: 1729 if (!(n->nlmsg_flags & NLM_F_CREATE)) { 1730 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); 1731 return -ENOENT; 1732 } 1733 create_n_graft2: 1734 if (clid == TC_H_INGRESS) { 1735 if (dev_ingress_queue(dev)) { 1736 q = qdisc_create(dev, dev_ingress_queue(dev), 1737 tcm->tcm_parent, tcm->tcm_parent, 1738 tca, &err, extack); 1739 } else { 1740 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); 1741 err = -ENOENT; 1742 } 1743 } else { 1744 struct netdev_queue *dev_queue; 1745 1746 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) 1747 dev_queue = p->ops->cl_ops->select_queue(p, tcm); 1748 else if (p) 1749 dev_queue = p->dev_queue; 1750 else 1751 dev_queue = netdev_get_tx_queue(dev, 0); 1752 1753 q = qdisc_create(dev, dev_queue, 1754 tcm->tcm_parent, tcm->tcm_handle, 1755 tca, &err, extack); 1756 } 1757 if (!q) 1758 return err; 1759 1760 graft: 1761 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); 1762 if (err) { 1763 if (q) 1764 qdisc_put(q); 1765 return err; 1766 } 1767 1768 return 0; 1769 } 1770 1771 static void request_qdisc_module(struct nlattr *kind) 1772 { 1773 struct Qdisc_ops *ops; 1774 char name[IFNAMSIZ]; 1775 1776 if (!kind) 1777 return; 1778 1779 ops = qdisc_lookup_ops(kind); 1780 if (ops) { 1781 bpf_module_put(ops, ops->owner); 1782 return; 1783 } 1784 1785 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { 1786 rtnl_unlock(); 1787 request_module(NET_SCH_ALIAS_PREFIX "%s", name); 1788 rtnl_lock(); 1789 } 1790 } 1791 1792 /* 1793 * Create/change qdisc. 
1794 */ 1795 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, 1796 struct netlink_ext_ack *extack) 1797 { 1798 struct net *net = sock_net(skb->sk); 1799 struct nlattr *tca[TCA_MAX + 1]; 1800 struct net_device *dev; 1801 struct tcmsg *tcm; 1802 int err; 1803 1804 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, 1805 rtm_tca_policy, extack); 1806 if (err < 0) 1807 return err; 1808 1809 request_qdisc_module(tca[TCA_KIND]); 1810 1811 tcm = nlmsg_data(n); 1812 dev = __dev_get_by_index(net, tcm->tcm_ifindex); 1813 if (!dev) 1814 return -ENODEV; 1815 1816 netdev_lock_ops(dev); 1817 err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm); 1818 netdev_unlock_ops(dev); 1819 1820 return err; 1821 } 1822 1823 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, 1824 struct netlink_callback *cb, 1825 int *q_idx_p, int s_q_idx, bool recur, 1826 bool dump_invisible) 1827 { 1828 int ret = 0, q_idx = *q_idx_p; 1829 struct Qdisc *q; 1830 int b; 1831 1832 if (!root) 1833 return 0; 1834 1835 q = root; 1836 if (q_idx < s_q_idx) { 1837 q_idx++; 1838 } else { 1839 if (!tc_qdisc_dump_ignore(q, dump_invisible) && 1840 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1841 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1842 RTM_NEWQDISC, NULL) <= 0) 1843 goto done; 1844 q_idx++; 1845 } 1846 1847 /* If dumping singletons, there is no qdisc_dev(root) and the singleton 1848 * itself has already been dumped. 1849 * 1850 * If we've already dumped the top-level (ingress) qdisc above and the global 1851 * qdisc hashtable, we don't want to hit it again 1852 */ 1853 if (!qdisc_dev(root) || !recur) 1854 goto out; 1855 1856 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { 1857 if (q_idx < s_q_idx) { 1858 q_idx++; 1859 continue; 1860 } 1861 if (!tc_qdisc_dump_ignore(q, dump_invisible) && 1862 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1863 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1864 RTM_NEWQDISC, NULL) <= 0) 1865 goto done; 1866 q_idx++; 1867 } 1868 1869 out: 1870 *q_idx_p = q_idx; 1871 return ret; 1872 done: 1873 ret = -1; 1874 goto out; 1875 } 1876 1877 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) 1878 { 1879 struct net *net = sock_net(skb->sk); 1880 int idx, q_idx; 1881 int s_idx, s_q_idx; 1882 struct net_device *dev; 1883 const struct nlmsghdr *nlh = cb->nlh; 1884 struct nlattr *tca[TCA_MAX + 1]; 1885 int err; 1886 1887 s_idx = cb->args[0]; 1888 s_q_idx = q_idx = cb->args[1]; 1889 1890 idx = 0; 1891 ASSERT_RTNL(); 1892 1893 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX, 1894 rtm_tca_policy, cb->extack); 1895 if (err < 0) 1896 return err; 1897 1898 for_each_netdev(net, dev) { 1899 struct netdev_queue *dev_queue; 1900 1901 if (idx < s_idx) 1902 goto cont; 1903 if (idx > s_idx) 1904 s_q_idx = 0; 1905 q_idx = 0; 1906 1907 netdev_lock_ops(dev); 1908 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), 1909 skb, cb, &q_idx, s_q_idx, 1910 true, tca[TCA_DUMP_INVISIBLE]) < 0) { 1911 netdev_unlock_ops(dev); 1912 goto done; 1913 } 1914 1915 dev_queue = dev_ingress_queue(dev); 1916 if (dev_queue && 1917 tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), 1918 skb, cb, &q_idx, s_q_idx, false, 1919 tca[TCA_DUMP_INVISIBLE]) < 0) { 1920 netdev_unlock_ops(dev); 1921 goto done; 1922 } 1923 netdev_unlock_ops(dev); 1924 1925 cont: 1926 idx++; 1927 } 1928 1929 done: 1930 cb->args[0] = idx; 1931 cb->args[1] = q_idx; 1932 1933 return skb->len; 1934 } 1935 1936 1937 1938 /************************************************ 
1939 * Traffic classes manipulation. * 1940 ************************************************/ 1941 1942 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 1943 unsigned long cl, u32 portid, u32 seq, u16 flags, 1944 int event, struct netlink_ext_ack *extack) 1945 { 1946 struct tcmsg *tcm; 1947 struct nlmsghdr *nlh; 1948 unsigned char *b = skb_tail_pointer(skb); 1949 struct gnet_dump d; 1950 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; 1951 1952 cond_resched(); 1953 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); 1954 if (!nlh) 1955 goto out_nlmsg_trim; 1956 tcm = nlmsg_data(nlh); 1957 tcm->tcm_family = AF_UNSPEC; 1958 tcm->tcm__pad1 = 0; 1959 tcm->tcm__pad2 = 0; 1960 tcm->tcm_ifindex = qdisc_dev(q)->ifindex; 1961 tcm->tcm_parent = q->handle; 1962 tcm->tcm_handle = q->handle; 1963 tcm->tcm_info = 0; 1964 if (nla_put_string(skb, TCA_KIND, q->ops->id)) 1965 goto nla_put_failure; 1966 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) 1967 goto nla_put_failure; 1968 1969 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 1970 NULL, &d, TCA_PAD) < 0) 1971 goto nla_put_failure; 1972 1973 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) 1974 goto nla_put_failure; 1975 1976 if (gnet_stats_finish_copy(&d) < 0) 1977 goto nla_put_failure; 1978 1979 if (extack && extack->_msg && 1980 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) 1981 goto out_nlmsg_trim; 1982 1983 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 1984 1985 return skb->len; 1986 1987 out_nlmsg_trim: 1988 nla_put_failure: 1989 nlmsg_trim(skb, b); 1990 return -1; 1991 } 1992 1993 static int tclass_notify(struct net *net, struct sk_buff *oskb, 1994 struct nlmsghdr *n, struct Qdisc *q, 1995 unsigned long cl, int event, struct netlink_ext_ack *extack) 1996 { 1997 struct sk_buff *skb; 1998 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 1999 2000 if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) 2001 return 0; 2002 2003 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2004 if (!skb) 2005 return -ENOBUFS; 2006 2007 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { 2008 kfree_skb(skb); 2009 return -EINVAL; 2010 } 2011 2012 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 2013 n->nlmsg_flags & NLM_F_ECHO); 2014 } 2015 2016 static int tclass_get_notify(struct net *net, struct sk_buff *oskb, 2017 struct nlmsghdr *n, struct Qdisc *q, 2018 unsigned long cl, struct netlink_ext_ack *extack) 2019 { 2020 struct sk_buff *skb; 2021 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; 2022 2023 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2024 if (!skb) 2025 return -ENOBUFS; 2026 2027 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS, 2028 extack) < 0) { 2029 kfree_skb(skb); 2030 return -EINVAL; 2031 } 2032 2033 return rtnetlink_send(skb, net, portid, RTNLGRP_TC, 2034 n->nlmsg_flags & NLM_F_ECHO); 2035 } 2036 2037 static int tclass_del_notify(struct net *net, 2038 const struct Qdisc_class_ops *cops, 2039 struct sk_buff *oskb, struct nlmsghdr *n, 2040 struct Qdisc *q, unsigned long cl, 2041 struct netlink_ext_ack *extack) 2042 { 2043 u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; 2044 struct sk_buff *skb; 2045 int err = 0; 2046 2047 if (!cops->delete) 2048 return -EOPNOTSUPP; 2049 2050 if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { 2051 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2052 if (!skb) 2053 return -ENOBUFS; 2054 2055 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, 2056 RTM_DELTCLASS, extack) < 0) { 2057 kfree_skb(skb); 2058 return -EINVAL; 2059 } 2060 } else { 2061 skb = NULL; 2062 } 2063 2064 err = cops->delete(q, cl, extack); 2065 if (err) { 2066 kfree_skb(skb); 2067 return err; 2068 } 2069 2070 err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, 2071 n->nlmsg_flags & NLM_F_ECHO); 2072 return err; 2073 } 2074 2075 #ifdef CONFIG_NET_CLS 2076 2077 struct tcf_bind_args { 2078 struct tcf_walker w; 2079 unsigned long base; 2080 unsigned long cl; 2081 u32 classid; 2082 }; 2083 2084 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) 2085 { 2086 struct tcf_bind_args *a = (void *)arg; 2087 2088 if (n && tp->ops->bind_class) { 2089 struct Qdisc *q = tcf_block_q(tp->chain->block); 2090 2091 sch_tree_lock(q); 2092 tp->ops->bind_class(n, a->classid, a->cl, q, a->base); 2093 sch_tree_unlock(q); 2094 } 2095 return 0; 2096 } 2097 2098 struct tc_bind_class_args { 2099 struct qdisc_walker w; 2100 unsigned long new_cl; 2101 u32 portid; 2102 u32 clid; 2103 }; 2104 2105 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, 2106 struct qdisc_walker *w) 2107 { 2108 struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; 2109 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 2110 struct tcf_block *block; 2111 struct tcf_chain *chain; 2112 2113 block = cops->tcf_block(q, cl, NULL); 2114 if (!block) 2115 return 0; 2116 for (chain = tcf_get_next_chain(block, NULL); 2117 chain; 2118 chain = tcf_get_next_chain(block, chain)) { 2119 struct tcf_proto *tp; 2120 2121 for (tp = tcf_get_next_proto(chain, NULL); 2122 tp; tp = tcf_get_next_proto(chain, tp)) { 2123 struct tcf_bind_args arg = {}; 2124 2125 arg.w.fn = tcf_node_bind; 2126 arg.classid = a->clid; 2127 arg.base = cl; 2128 arg.cl = a->new_cl; 2129 tp->ops->walk(tp, &arg.w, true); 2130 } 2131 } 2132 2133 return 0; 2134 } 2135 2136 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, 2137 unsigned long new_cl) 2138 { 2139 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 2140 struct tc_bind_class_args args = {}; 2141 2142 if (!cops->tcf_block) 2143 return; 2144 args.portid = portid; 2145 args.clid = clid; 2146 args.new_cl = new_cl; 2147 args.w.fn = tc_bind_class_walker; 2148 q->ops->cl_ops->walk(q, &args.w); 2149 } 2150 2151 #else 2152 2153 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, 2154 unsigned long new_cl) 2155 { 2156 } 2157 2158 #endif 2159 2160 static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, 2161 struct netlink_ext_ack *extack, 2162 struct net_device *dev, 2163 struct nlattr *tca[TCA_MAX + 1], 2164 struct tcmsg *tcm) 2165 { 2166 struct net *net = sock_net(skb->sk); 2167 const struct Qdisc_class_ops *cops; 2168 struct Qdisc *q = NULL; 2169 unsigned long cl = 0; 2170 unsigned long new_cl; 2171 u32 portid; 2172 u32 clid; 2173 u32 qid; 2174 int err; 2175 2176 /* 2177 parent == TC_H_UNSPEC - unspecified parent. 2178 parent == TC_H_ROOT - class is root, which has no parent. 2179 parent == X:0 - parent is root class. 2180 parent == X:Y - parent is a node in hierarchy. 2181 parent == 0:Y - parent is X:Y, where X:0 is qdisc. 
static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack,
			   struct net_device *dev,
			   struct nlattr *tca[TCA_MAX + 1],
			   struct tcmsg *tcm)
{
	struct net *net = sock_net(skb->sk);
	const struct Qdisc_class_ops *cops;
	struct Qdisc *q = NULL;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class from its filters by rebinding to class 0. */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_get_notify(net, skb, n, q, cl, extack);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Prevent creation of traffic classes with classid TC_H_ROOT */
	if (clid == TC_H_ROOT) {
		NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT");
		return -EINVAL;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
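/*
 * Worked example (illustrative only; it just applies the parent/handle
 * rules documented in __tc_ctl_tclass() above).  A request equivalent to
 *
 *	tc class add dev eth0 parent 1: classid 1:10 ...
 *
 * arrives with tcm_parent = 0x00010000 and tcm_handle = 0x00010010
 * (tc parses the major and minor parts as hex).  Step 1 then gives
 * qid = TC_H_MAJ(0x00010010) = 0x00010000, portid stays 0x00010000,
 * and clid becomes TC_H_MAKE(qid, 0x00010010) = 0x00010010, which
 * cops->find() looks up before cops->change() creates or updates it.
 */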
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	netdev_lock_ops(dev);
	err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm);
	netdev_unlock_ops(dev);

	return err;
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS, NULL);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb,
			    struct tcmsg *tcm, struct net_device *dev)
{
	struct netdev_queue *dev_queue;
	int t, s_t;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	return skb->len;
}
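/*
 * Dump resume state (comment only).  Across netlink dump callbacks the
 * code above keeps its position in cb->args[]:
 *
 *	cb->args[0]	index (t) of the qdisc whose classes are being dumped
 *	cb->args[1]	number of classes already emitted for that qdisc,
 *			fed back in as arg.w.skip on the next pass
 *
 * tc_dump_tclass_qdisc() clears args[1..] when it moves on to a new qdisc,
 * so each qdisc's class walk restarts from zero.
 */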
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;
	int err;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;

	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	netdev_lock_ops(dev);
	err = __tc_dump_tclass(skb, cb, tcm, dev);
	netdev_unlock_ops(dev);

	dev_put(dev);

	return err;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif

static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = {
	{.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc},
	{.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc},
	{.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc,
	 .dumpit = tc_dump_qdisc},
	{.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass},
	{.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass},
	{.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass,
	 .dumpit = tc_dump_tclass},
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register_many(psched_rtnl_msg_handlers);

	tc_wrapper_init();

	return 0;
}

subsys_initcall(pktsched_init);
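/*
 * Example /proc/net/psched output (illustrative; exact values depend on the
 * kernel configuration).  Assuming PSCHED_SHIFT == 6 and high-resolution
 * timers with hrtimer_resolution == 1 nsec, psched_show() above prints:
 *
 *	$ cat /proc/net/psched
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e., in hex: NSEC_PER_USEC (1000), PSCHED_TICKS2NS(1) (64), the fixed
 * 1000000 constant kept for old tc binaries, and NSEC_PER_SEC divided by
 * the hrtimer resolution.
 */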